File size: 82,289 Bytes
00df61d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 | /* ----------------------------------------------------------------------------
Copyright (c) 2019-2024 Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* ----------------------------------------------------------------------------
Concurrent bitmap that can set/reset sequences of bits atomically
---------------------------------------------------------------------------- */
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "mimalloc/bits.h"
#include "mimalloc/prim.h" // _mi_prim_thread_yield
#include "bitmap.h"
#ifndef MI_OPT_SIMD
#define MI_OPT_SIMD 0
#endif
/* --------------------------------------------------------------------------------
bfields
-------------------------------------------------------------------------------- */
static inline size_t mi_bfield_ctz(mi_bfield_t x) {
return mi_ctz(x);
}
static inline size_t mi_bfield_clz(mi_bfield_t x) {
return mi_clz(x);
}
static inline size_t mi_bfield_popcount(mi_bfield_t x) {
return mi_popcount(x);
}
static inline mi_bfield_t mi_bfield_clear_least_bit(mi_bfield_t x) {
return (x & (x-1));
}
// find the least significant bit that is set (i.e. count trailing zero's)
// return false if `x==0` (with `*idx` undefined) and true otherwise,
// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`).
static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) {
return mi_bsf(x,idx);
}
// find the most significant bit that is set.
// return false if `x==0` (with `*idx` undefined) and true otherwise,
// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`).
static inline bool mi_bfield_find_highest_bit(mi_bfield_t x, size_t* idx) {
return mi_bsr(x, idx);
}
// find each set bit in a bit field `x` and clear it, until it becomes zero.
static inline bool mi_bfield_foreach_bit(mi_bfield_t* x, size_t* idx) {
const bool found = mi_bfield_find_least_bit(*x, idx);
*x = mi_bfield_clear_least_bit(*x);
return found;
}
static inline mi_bfield_t mi_bfield_zero(void) {
return 0;
}
static inline mi_bfield_t mi_bfield_one(void) {
return 1;
}
static inline mi_bfield_t mi_bfield_all_set(void) {
return ~((mi_bfield_t)0);
}
// mask of `bit_count` bits set shifted to the left by `shiftl`
static inline mi_bfield_t mi_bfield_mask(size_t bit_count, size_t shiftl) {
mi_assert_internal(bit_count > 0);
mi_assert_internal(bit_count + shiftl <= MI_BFIELD_BITS);
mi_assert_internal(shiftl < MI_BFIELD_BITS);
const mi_bfield_t mask0 = (bit_count < MI_BFIELD_BITS ? (mi_bfield_one() << bit_count)-1 : mi_bfield_all_set());
return (mask0 << shiftl);
}
// ------- mi_bfield_atomic_set ---------------------------------------
// the `_set` functions return also the count of bits that were already set (for commit statistics)
// the `_clear` functions return also whether the new bfield is all clear or not (for the chunk_map)
// Set a bit atomically. Returns `true` if the bit transitioned from 0 to 1
static inline bool mi_bfield_atomic_set(_Atomic(mi_bfield_t)*b, size_t idx) {
mi_assert_internal(idx < MI_BFIELD_BITS);
const mi_bfield_t mask = mi_bfield_mask(1, idx);;
const mi_bfield_t old = mi_atomic_or_acq_rel(b, mask);
return ((old&mask) == 0);
}
// Clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0.
// `all_clear` is set if the new bfield is zero.
static inline bool mi_bfield_atomic_clear(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_clear) {
mi_assert_internal(idx < MI_BFIELD_BITS);
const mi_bfield_t mask = mi_bfield_mask(1, idx);;
mi_bfield_t old = mi_atomic_and_acq_rel(b, ~mask);
if (all_clear != NULL) { *all_clear = ((old&~mask)==0); }
return ((old&mask) == mask);
}
// Clear a bit but only when/once it is set. This is used by concurrent free's while
// the page is abandoned and mapped. This can incure a busy wait :-( but it should
// happen almost never (and is accounted for in the stats)
static inline void mi_bfield_atomic_clear_once_set(_Atomic(mi_bfield_t)*b, size_t idx) {
mi_assert_internal(idx < MI_BFIELD_BITS);
const mi_bfield_t mask = mi_bfield_mask(1, idx);;
mi_bfield_t old = mi_atomic_load_relaxed(b);
do {
if mi_unlikely((old&mask) == 0) {
old = mi_atomic_load_acquire(b);
if ((old&mask)==0) {
mi_subproc_stat_counter_increase(_mi_subproc(), pages_unabandon_busy_wait, 1);
}
while ((old&mask)==0) { // busy wait
_mi_prim_thread_yield();
old = mi_atomic_load_acquire(b);
}
}
} while (!mi_atomic_cas_weak_acq_rel(b,&old, (old&~mask)));
mi_assert_internal((old&mask)==mask); // we should only clear when it was set
}
// Set a mask set of bits atomically, and return true of the mask bits transitioned from all 0's to 1's.
// `already_set` contains the count of bits that were already set (used when committing ranges to account
// statistics correctly).
static inline bool mi_bfield_atomic_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, size_t* already_set) {
mi_assert_internal(mask != 0);
mi_bfield_t old = mi_atomic_load_relaxed(b);
while (!mi_atomic_cas_weak_acq_rel(b, &old, old|mask)) {}; // try to atomically set the mask bits until success
if (already_set!=NULL) { *already_set = mi_bfield_popcount(old&mask); }
return ((old&mask) == 0);
}
// Clear a mask set of bits atomically, and return true of the mask bits transitioned from all 1's to 0's
// `all_clear` is set to `true` if the new bfield became zero.
static inline bool mi_bfield_atomic_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* all_clear) {
mi_assert_internal(mask != 0);
mi_bfield_t old = mi_atomic_load_relaxed(b);
while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask)) {}; // try to atomically clear the mask bits until success
if (all_clear != NULL) { *all_clear = ((old&~mask)==0); }
return ((old&mask) == mask);
}
static inline bool mi_bfield_atomic_setX(_Atomic(mi_bfield_t)*b, size_t* already_set) {
const mi_bfield_t old = mi_atomic_exchange_release(b, mi_bfield_all_set());
if (already_set!=NULL) { *already_set = mi_bfield_popcount(old); }
return (old==0);
}
// static inline bool mi_bfield_atomic_clearX(_Atomic(mi_bfield_t)*b, bool* all_clear) {
// const mi_bfield_t old = mi_atomic_exchange_release(b, mi_bfield_zero());
// if (all_clear!=NULL) { *all_clear = true; }
// return (~old==0);
// }
// ------- mi_bfield_atomic_try_clear ---------------------------------------
// Tries to clear a mask atomically, and returns true if the mask bits atomically transitioned from mask to 0
// and false otherwise (leaving the bit field as is).
// `all_clear` is set to `true` if the new bfield became zero.
static inline bool mi_bfield_atomic_try_clear_mask_of(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, mi_bfield_t expect, bool* all_clear) {
mi_assert_internal(mask != 0);
// try to atomically clear the mask bits
do {
if ((expect & mask) != mask) { // are all bits still set?
if (all_clear != NULL) { *all_clear = (expect == 0); }
return false;
}
} while (!mi_atomic_cas_weak_acq_rel(b, &expect, expect & ~mask));
if (all_clear != NULL) { *all_clear = ((expect & ~mask) == 0); }
return true;
}
static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)* b, mi_bfield_t mask, bool* all_clear) {
mi_assert_internal(mask != 0);
const mi_bfield_t expect = mi_atomic_load_relaxed(b);
return mi_bfield_atomic_try_clear_mask_of(b, mask, expect, all_clear);
}
// Tries to clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0
// and `false` otherwise leaving the bfield `b` as-is.
// `all_clear` is set to true if the new bfield became zero (and false otherwise)
mi_decl_maybe_unused static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)* b, size_t idx, bool* all_clear) {
mi_assert_internal(idx < MI_BFIELD_BITS);
const mi_bfield_t mask = mi_bfield_one()<<idx;
return mi_bfield_atomic_try_clear_mask(b, mask, all_clear);
}
// Tries to clear a byte atomically, and returns true if the byte atomically transitioned from 0xFF to 0
// `all_clear` is set to true if the new bfield became zero (and false otherwise)
mi_decl_maybe_unused static inline bool mi_bfield_atomic_try_clear8(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_clear) {
mi_assert_internal(idx < MI_BFIELD_BITS);
mi_assert_internal((idx%8)==0);
const mi_bfield_t mask = ((mi_bfield_t)0xFF)<<idx;
return mi_bfield_atomic_try_clear_mask(b, mask, all_clear);
}
// Try to clear a full field of bits atomically, and return true all bits transitioned from all 1's to 0's.
// and false otherwise leaving the bit field as-is.
// `all_clear` is set to true if the new bfield became zero (which is always the case if successful).
static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b, bool* all_clear) {
mi_bfield_t old = mi_bfield_all_set();
if (mi_atomic_cas_strong_acq_rel(b, &old, mi_bfield_zero())) {
if (all_clear != NULL) { *all_clear = true; }
return true;
}
else return false;
}
// ------- mi_bfield_atomic_is_set ---------------------------------------
// Check if a bit is set
static inline bool mi_bfield_atomic_is_set(const _Atomic(mi_bfield_t)*b, const size_t idx) {
const mi_bfield_t x = mi_atomic_load_acquire(b);
return ((x & mi_bfield_mask(1,idx)) != 0);
}
// Check if a bit is clear
static inline bool mi_bfield_atomic_is_clear(const _Atomic(mi_bfield_t)*b, const size_t idx) {
const mi_bfield_t x = mi_atomic_load_acquire(b);
return ((x & mi_bfield_mask(1, idx)) == 0);
}
// Check if a bit is xset
static inline bool mi_bfield_atomic_is_xset(mi_xset_t set, const _Atomic(mi_bfield_t)*b, const size_t idx) {
if (set) return mi_bfield_atomic_is_set(b, idx);
else return mi_bfield_atomic_is_clear(b, idx);
}
// Check if all bits corresponding to a mask are set.
static inline bool mi_bfield_atomic_is_set_mask(const _Atomic(mi_bfield_t)* b, mi_bfield_t mask) {
mi_assert_internal(mask != 0);
const mi_bfield_t x = mi_atomic_load_acquire(b);
return ((x & mask) == mask);
}
// Check if all bits corresponding to a mask are clear.
static inline bool mi_bfield_atomic_is_clear_mask(const _Atomic(mi_bfield_t)* b, mi_bfield_t mask) {
mi_assert_internal(mask != 0);
const mi_bfield_t x = mi_atomic_load_acquire(b);
return ((x & mask) == 0);
}
// Check if all bits corresponding to a mask are set/cleared.
static inline bool mi_bfield_atomic_is_xset_mask(mi_xset_t set, const _Atomic(mi_bfield_t)* b, mi_bfield_t mask) {
mi_assert_internal(mask != 0);
if (set) return mi_bfield_atomic_is_set_mask(b, mask);
else return mi_bfield_atomic_is_clear_mask(b, mask);
}
// Count bits in a mask
static inline size_t mi_bfield_atomic_popcount_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask) {
const mi_bfield_t x = mi_atomic_load_acquire(b);
return mi_bfield_popcount(x & mask);
}
/* --------------------------------------------------------------------------------
bitmap chunks
-------------------------------------------------------------------------------- */
// ------- mi_bchunk_set ---------------------------------------
// Set a single bit
static inline bool mi_bchunk_set(mi_bchunk_t* chunk, size_t cidx, size_t* already_set) {
mi_assert_internal(cidx < MI_BCHUNK_BITS);
const size_t i = cidx / MI_BFIELD_BITS;
const size_t idx = cidx % MI_BFIELD_BITS;
const bool was_clear = mi_bfield_atomic_set(&chunk->bfields[i], idx);
if (already_set != NULL) { *already_set = (was_clear ? 0 : 1); }
return was_clear;
}
// Set `0 < n <= MI_BFIELD_BITS`, and return true of the mask bits transitioned from all 0's to 1's.
// `already_set` contains the count of bits that were already set (used when committing ranges to account
// statistics correctly).
// Can cross over two bfields.
static inline bool mi_bchunk_setNX(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) {
mi_assert_internal(cidx < MI_BCHUNK_BITS);
mi_assert_internal(n > 0 && n <= MI_BFIELD_BITS);
const size_t i = cidx / MI_BFIELD_BITS;
const size_t idx = cidx % MI_BFIELD_BITS;
if mi_likely(idx + n <= MI_BFIELD_BITS) {
// within one field
return mi_bfield_atomic_set_mask(&chunk->bfields[i], mi_bfield_mask(n,idx), already_set);
}
else {
// spanning two fields
const size_t m = MI_BFIELD_BITS - idx; // bits to clear in the first field
mi_assert_internal(m < n);
mi_assert_internal(i < MI_BCHUNK_FIELDS - 1);
mi_assert_internal(idx + m <= MI_BFIELD_BITS);
size_t already_set1;
const bool all_set1 = mi_bfield_atomic_set_mask(&chunk->bfields[i], mi_bfield_mask(m, idx), &already_set1);
mi_assert_internal(n - m > 0);
mi_assert_internal(n - m < MI_BFIELD_BITS);
size_t already_set2;
const bool all_set2 = mi_bfield_atomic_set_mask(&chunk->bfields[i+1], mi_bfield_mask(n - m, 0), &already_set2);
if (already_set != NULL) { *already_set = already_set1 + already_set2; }
return (all_set1 && all_set2);
}
}
// Set a sequence of `n` bits within a chunk.
// Returns true if all bits transitioned from 0 to 1 (or 1 to 0).
mi_decl_noinline static bool mi_bchunk_xsetNC(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* palready_set, bool* pmaybe_all_clear) {
mi_assert_internal(cidx + n <= MI_BCHUNK_BITS);
mi_assert_internal(n>0);
bool all_transition = true;
bool maybe_all_clear = true;
size_t total_already_set = 0;
size_t idx = cidx % MI_BFIELD_BITS;
size_t field = cidx / MI_BFIELD_BITS;
while (n > 0) {
size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field
if (m > n) { m = n; }
mi_assert_internal(idx + m <= MI_BFIELD_BITS);
mi_assert_internal(field < MI_BCHUNK_FIELDS);
const mi_bfield_t mask = mi_bfield_mask(m, idx);
size_t already_set = 0;
bool all_clear = false;
const bool transition = (set ? mi_bfield_atomic_set_mask(&chunk->bfields[field], mask, &already_set)
: mi_bfield_atomic_clear_mask(&chunk->bfields[field], mask, &all_clear));
mi_assert_internal((transition && already_set == 0) || (!transition && already_set > 0));
all_transition = all_transition && transition;
total_already_set += already_set;
maybe_all_clear = maybe_all_clear && all_clear;
// next field
field++;
idx = 0;
mi_assert_internal(m <= n);
n -= m;
}
if (palready_set!=NULL) { *palready_set = total_already_set; }
if (pmaybe_all_clear!=NULL) { *pmaybe_all_clear = maybe_all_clear; }
return all_transition;
}
static inline bool mi_bchunk_setN(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) {
mi_assert_internal(n>0 && n <= MI_BCHUNK_BITS);
if (n==1) return mi_bchunk_set(chunk, cidx, already_set);
// if (n==8 && (cidx%8) == 0) return mi_bchunk_set8(chunk, cidx, already_set);
// if (n==MI_BFIELD_BITS) return mi_bchunk_setX(chunk, cidx, already_set);
if (n<=MI_BFIELD_BITS) return mi_bchunk_setNX(chunk, cidx, n, already_set);
return mi_bchunk_xsetNC(MI_BIT_SET, chunk, cidx, n, already_set, NULL);
}
// ------- mi_bchunk_clear ---------------------------------------
static inline bool mi_bchunk_clear(mi_bchunk_t* chunk, size_t cidx, bool* all_clear) {
mi_assert_internal(cidx < MI_BCHUNK_BITS);
const size_t i = cidx / MI_BFIELD_BITS;
const size_t idx = cidx % MI_BFIELD_BITS;
return mi_bfield_atomic_clear(&chunk->bfields[i], idx, all_clear);
}
static inline bool mi_bchunk_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* maybe_all_clear) {
mi_assert_internal(n>0 && n <= MI_BCHUNK_BITS);
if (n==1) return mi_bchunk_clear(chunk, cidx, maybe_all_clear);
// if (n==8) return mi_bchunk_clear8(chunk, cidx, maybe_all_clear);
// if (n==MI_BFIELD_BITS) return mi_bchunk_clearX(chunk, cidx, maybe_all_clear);
// TODO: implement mi_bchunk_xsetNX instead of setNX
return mi_bchunk_xsetNC(MI_BIT_CLEAR, chunk, cidx, n, NULL, maybe_all_clear);
}
// Check if a sequence of `n` bits within a chunk are all set/cleared.
// This can cross bfield's
mi_decl_noinline static size_t mi_bchunk_popcountNC(mi_bchunk_t* chunk, size_t field_idx, size_t idx, size_t n) {
mi_assert_internal((field_idx*MI_BFIELD_BITS) + idx + n <= MI_BCHUNK_BITS);
size_t count = 0;
while (n > 0) {
size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field
if (m > n) { m = n; }
mi_assert_internal(idx + m <= MI_BFIELD_BITS);
mi_assert_internal(field_idx < MI_BCHUNK_FIELDS);
const size_t mask = mi_bfield_mask(m, idx);
count += mi_bfield_atomic_popcount_mask(&chunk->bfields[field_idx], mask);
// next field
field_idx++;
idx = 0;
n -= m;
}
return count;
}
// Count set bits a sequence of `n` bits.
static inline size_t mi_bchunk_popcountN(mi_bchunk_t* chunk, size_t cidx, size_t n) {
mi_assert_internal(cidx + n <= MI_BCHUNK_BITS);
mi_assert_internal(n>0);
if (n==0) return 0;
const size_t i = cidx / MI_BFIELD_BITS;
const size_t idx = cidx % MI_BFIELD_BITS;
if (n==1) { return (mi_bfield_atomic_is_set(&chunk->bfields[i], idx) ? 1 : 0); }
if (idx + n <= MI_BFIELD_BITS) { return mi_bfield_atomic_popcount_mask(&chunk->bfields[i], mi_bfield_mask(n, idx)); }
return mi_bchunk_popcountNC(chunk, i, idx, n);
}
// ------- mi_bchunk_is_xset ---------------------------------------
// Check if a sequence of `n` bits within a chunk are all set/cleared.
// This can cross bfield's
mi_decl_noinline static bool mi_bchunk_is_xsetNC(mi_xset_t set, const mi_bchunk_t* chunk, size_t field_idx, size_t idx, size_t n) {
mi_assert_internal((field_idx*MI_BFIELD_BITS) + idx + n <= MI_BCHUNK_BITS);
while (n > 0) {
size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field
if (m > n) { m = n; }
mi_assert_internal(idx + m <= MI_BFIELD_BITS);
mi_assert_internal(field_idx < MI_BCHUNK_FIELDS);
const size_t mask = mi_bfield_mask(m, idx);
if (!mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[field_idx], mask)) {
return false;
}
// next field
field_idx++;
idx = 0;
n -= m;
}
return true;
}
// Check if a sequence of `n` bits within a chunk are all set/cleared.
static inline bool mi_bchunk_is_xsetN(mi_xset_t set, const mi_bchunk_t* chunk, size_t cidx, size_t n) {
mi_assert_internal(cidx + n <= MI_BCHUNK_BITS);
mi_assert_internal(n>0);
if (n==0) return true;
const size_t i = cidx / MI_BFIELD_BITS;
const size_t idx = cidx % MI_BFIELD_BITS;
if (n==1) { return mi_bfield_atomic_is_xset(set, &chunk->bfields[i], idx); }
if (idx + n <= MI_BFIELD_BITS) { return mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[i], mi_bfield_mask(n, idx)); }
return mi_bchunk_is_xsetNC(set, chunk, i, idx, n);
}
// ------- mi_bchunk_try_clear ---------------------------------------
// Clear `0 < n <= MI_BITFIELD_BITS`. Can cross over a bfield boundary.
static inline bool mi_bchunk_try_clearNX(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) {
mi_assert_internal(cidx < MI_BCHUNK_BITS);
mi_assert_internal(n <= MI_BFIELD_BITS);
const size_t i = cidx / MI_BFIELD_BITS;
const size_t idx = cidx % MI_BFIELD_BITS;
if mi_likely(idx + n <= MI_BFIELD_BITS) {
// within one field
return mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], mi_bfield_mask(n, idx), pmaybe_all_clear);
}
else {
// spanning two fields (todo: use double-word atomic ops?)
const size_t m = MI_BFIELD_BITS - idx; // bits to clear in the first field
mi_assert_internal(m < n);
mi_assert_internal(i < MI_BCHUNK_FIELDS - 1);
bool field1_is_clear;
if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], mi_bfield_mask(m, idx), &field1_is_clear)) return false;
// try the second field as well
mi_assert_internal(n - m > 0);
mi_assert_internal(n - m < MI_BFIELD_BITS);
bool field2_is_clear;
if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[i+1], mi_bfield_mask(n - m, 0), &field2_is_clear)) {
// we failed to clear the second field, restore the first one
mi_bfield_atomic_set_mask(&chunk->bfields[i], mi_bfield_mask(m, idx), NULL);
return false;
}
if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = field1_is_clear && field2_is_clear; }
return true;
}
}
// Clear a full aligned bfield.
// static inline bool mi_bchunk_try_clearX(mi_bchunk_t* chunk, size_t cidx, bool* pmaybe_all_clear) {
// mi_assert_internal(cidx < MI_BCHUNK_BITS);
// mi_assert_internal((cidx%MI_BFIELD_BITS) == 0);
// const size_t i = cidx / MI_BFIELD_BITS;
// return mi_bfield_atomic_try_clearX(&chunk->bfields[i], pmaybe_all_clear);
// }
// Try to atomically clear a sequence of `n` bits within a chunk.
// Returns true if all bits transitioned from 1 to 0,
// and false otherwise leaving all bit fields as is.
// Note: this is the complex one as we need to unwind partial atomic operations if we fail halfway..
// `maybe_all_clear` is set to `true` if all the bfields involved become zero.
mi_decl_noinline static bool mi_bchunk_try_clearNC(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) {
mi_assert_internal(cidx + n <= MI_BCHUNK_BITS);
mi_assert_internal(n>0);
if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = true; }
if (n==0) return true;
// first field
const size_t start_idx = cidx % MI_BFIELD_BITS;
const size_t start_field = cidx / MI_BFIELD_BITS;
size_t field = start_field;
size_t m = MI_BFIELD_BITS - start_idx; // m are the bits to clear in this field
if (m > n) { m = n; }
mi_assert_internal(start_idx + m <= MI_BFIELD_BITS);
mi_assert_internal(start_field < MI_BCHUNK_FIELDS);
const mi_bfield_t mask_start = mi_bfield_mask(m, start_idx);
bool maybe_all_clear;
if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_start, &maybe_all_clear)) return false;
// done?
mi_assert_internal(m <= n);
n -= m;
// continue with mid fields and last field: if these fail we need to recover by unsetting previous fields
// mid fields?
while (n >= MI_BFIELD_BITS) {
field++;
mi_assert_internal(field < MI_BCHUNK_FIELDS);
bool field_is_clear;
if (!mi_bfield_atomic_try_clearX(&chunk->bfields[field], &field_is_clear)) goto restore;
maybe_all_clear = maybe_all_clear && field_is_clear;
n -= MI_BFIELD_BITS;
}
// last field?
if (n > 0) {
mi_assert_internal(n < MI_BFIELD_BITS);
field++;
mi_assert_internal(field < MI_BCHUNK_FIELDS);
const mi_bfield_t mask_end = mi_bfield_mask(n, 0);
bool field_is_clear;
if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_end, &field_is_clear)) goto restore;
maybe_all_clear = maybe_all_clear && field_is_clear;
}
if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = maybe_all_clear; }
return true;
restore:
// `field` is the index of the field that failed to set atomically; we need to restore all previous fields
mi_assert_internal(field > start_field);
while( field > start_field) {
field--;
if (field == start_field) {
mi_bfield_atomic_set_mask(&chunk->bfields[field], mask_start, NULL);
}
else {
mi_bfield_atomic_setX(&chunk->bfields[field], NULL); // mid-field: set all bits again
}
}
return false;
}
static inline bool mi_bchunk_try_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* maybe_all_clear) {
mi_assert_internal(n>0);
// if (n==MI_BFIELD_BITS) return mi_bchunk_try_clearX(chunk, cidx, maybe_all_clear);
if (n<=MI_BFIELD_BITS) return mi_bchunk_try_clearNX(chunk, cidx, n, maybe_all_clear);
return mi_bchunk_try_clearNC(chunk, cidx, n, maybe_all_clear);
}
// ------- mi_bchunk_try_find_and_clear ---------------------------------------
#if MI_OPT_SIMD && defined(__AVX2__)
mi_decl_maybe_unused static inline __m256i mi_mm256_zero(void) {
return _mm256_setzero_si256();
}
mi_decl_maybe_unused static inline __m256i mi_mm256_ones(void) {
return _mm256_set1_epi64x(~0);
}
mi_decl_maybe_unused static inline bool mi_mm256_is_ones(__m256i vec) {
return _mm256_testc_si256(vec, _mm256_cmpeq_epi32(vec, vec));
}
mi_decl_maybe_unused static inline bool mi_mm256_is_zero( __m256i vec) {
return _mm256_testz_si256(vec,vec);
}
#endif
static inline bool mi_bchunk_try_find_and_clear_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx) {
mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
// note: this must be acquire (and not relaxed), or otherwise the AVX code below can loop forever
// as the compiler won't reload the registers vec1 and vec2 from memory again.
const mi_bfield_t b = mi_atomic_load_acquire(&chunk->bfields[chunk_idx]);
size_t idx;
if (mi_bfield_find_least_bit(b, &idx)) { // find the least bit
if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[chunk_idx], mi_bfield_mask(1,idx), b, NULL)) { // clear it atomically
*pidx = (chunk_idx*MI_BFIELD_BITS) + idx;
mi_assert_internal(*pidx < MI_BCHUNK_BITS);
return true;
}
}
return false;
}
// Find least 1-bit in a chunk and try to clear it atomically
// set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success.
// This is used to find free slices and abandoned pages and should be efficient.
// todo: try neon version
static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx) {
#if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256)
for(int tries=0; tries<4; tries++) { // paranoia: at most 4 tries
const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 0xFF : 0)
const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear)
// mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared)
if (mask==0) return false;
mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24
const size_t chunk_idx = _tzcnt_u32(mask) / 8;
if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true;
// try again
// note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded
// we add an explicit memory barrier as older gcc compilers do not reload the registers even with an atomic acquire (issue #1206)
#if defined(__GNUC__)
__asm __volatile ("" : : "g"(chunk) : "memory");
#endif
}
#elif MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
for(int tries=0; tries<4; tries++) { // paranoia: at most 4 tries
size_t chunk_idx = 0;
#if 0
// one vector at a time
__m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
if (mi_mm256_is_zero(vec)) {
chunk_idx += 4;
vec = _mm256_load_si256(((const __m256i*)chunk->bfields) + 1);
}
const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 0xFF : 0)
const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear)
// mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared)
if (mask==0) return false;
mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24
chunk_idx += _tzcnt_u32(mask) / 8;
#else
// a cache line is 64b so we can just as well load all at the same time
const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1);
const __m256i cmpv = mi_mm256_zero();
const __m256i vcmp1 = _mm256_cmpeq_epi64(vec1, cmpv); // (elem64 == 0 ? 0xFF : 0)
const __m256i vcmp2 = _mm256_cmpeq_epi64(vec2, cmpv); // (elem64 == 0 ? 0xFF : 0)
const uint32_t mask1 = ~_mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte (so each 8 bits are all set or clear)
const uint32_t mask2 = ~_mm256_movemask_epi8(vcmp2); // mask of most significant bit of each byte (so each 8 bits are all set or clear)
const uint64_t mask = ((uint64_t)mask2 << 32) | mask1;
// mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared)
if (mask==0) return false;
mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , ..
chunk_idx = mi_ctz(mask) / 8;
#endif
if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true;
// try again
// note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded
// we add an explicit memory barrier as older gcc compilers do not reload the registers even with an atomic acquire (issue #1206)
#if defined(__GNUC__)
__asm __volatile ("" : : "g"(chunk) : "memory");
#endif
}
#elif MI_OPT_SIMD && (MI_BCHUNK_BITS==512) && MI_ARCH_ARM64
for(int tries=0; tries<4; tries++) { // paranoia: at most 4 tries
// a cache line is 64b so we can just as well load all at the same time (?)
const uint64x2_t vzero1_lo = vceqzq_u64(vld1q_u64((uint64_t*)chunk->bfields)); // 2x64 bit is_zero
const uint64x2_t vzero1_hi = vceqzq_u64(vld1q_u64((uint64_t*)chunk->bfields + 2)); // 2x64 bit is_zero
const uint64x2_t vzero2_lo = vceqzq_u64(vld1q_u64((uint64_t*)chunk->bfields + 4)); // 2x64 bit is_zero
const uint64x2_t vzero2_hi = vceqzq_u64(vld1q_u64((uint64_t*)chunk->bfields + 6)); // 2x64 bit is_zero
const uint32x4_t vzero1 = vuzp1q_u32(vreinterpretq_u32_u64(vzero1_lo),vreinterpretq_u32_u64(vzero1_hi)); // unzip even elements: narrow to 4x32 bit is_zero ()
const uint32x4_t vzero2 = vuzp1q_u32(vreinterpretq_u32_u64(vzero2_lo),vreinterpretq_u32_u64(vzero2_hi)); // unzip even elements: narrow to 4x32 bit is_zero ()
const uint32x4_t vzero1x = vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(vzero1), 24)); // shift-right 2x32bit elem by 24: lo 16 bits contain the 2 lo bytes
const uint32x4_t vzero2x = vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(vzero2), 24));
const uint16x8_t vzero12 = vreinterpretq_u16_u32(vuzp1q_u32(vzero1x,vzero2x)); // unzip even 32-bit elements into one vector
const uint8x8_t vzero = vmovn_u16(vzero12); // narrow the bottom 16-bits
const uint64_t mask = ~vget_lane_u64(vreinterpret_u64_u8(vzero), 0); // 1 byte for each bfield (0xFF => bfield has a bit set)
if (mask==0) return false;
mi_assert_internal((mi_ctz(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , ..
const size_t chunk_idx = mi_ctz(mask) / 8;
if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true;
// try again
// note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded
// we add an explicit memory barrier as older gcc compilers do not reload the registers even with an atomic acquire (issue #1206)
#if defined(__GNUC__)
__asm __volatile ("" : : "g"(chunk) : "memory");
#endif
}
#else
for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx)) return true;
}
#endif
return false;
}
static inline bool mi_bchunk_try_find_and_clear_1(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
mi_assert_internal(n==1); MI_UNUSED(n);
return mi_bchunk_try_find_and_clear(chunk, pidx);
}
mi_decl_maybe_unused static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx) {
const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]);
// has_set8 has low bit in each byte set if the byte in x == 0xFF
const mi_bfield_t has_set8 =
((~b - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F
(b & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80
>> 7; // shift high bit to low bit
size_t idx;
if (mi_bfield_find_least_bit(has_set8, &idx)) { // find least 1-bit
mi_assert_internal(idx <= (MI_BFIELD_BITS - 8));
mi_assert_internal((idx%8)==0);
if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[chunk_idx], (mi_bfield_t)0xFF << idx, b, NULL)) { // unset the byte atomically
*pidx = (chunk_idx*MI_BFIELD_BITS) + idx;
mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS);
return true;
}
}
return false;
}
// find least aligned byte in a chunk with all bits set, and try unset it atomically
// set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success.
// Used to find medium size pages in the free blocks.
// todo: try neon version
static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, size_t* pidx) {
#if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
while (true) {
// since a cache-line is 64b, load all at once
const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1);
const __m256i cmpv = mi_mm256_ones();
const __m256i vcmp1 = _mm256_cmpeq_epi8(vec1, cmpv); // (byte == ~0 ? 0xFF : 0)
const __m256i vcmp2 = _mm256_cmpeq_epi8(vec2, cmpv); // (byte == ~0 ? 0xFF : 0)
const uint32_t mask1 = _mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte
const uint32_t mask2 = _mm256_movemask_epi8(vcmp2); // mask of most significant bit of each byte
const uint64_t mask = ((uint64_t)mask2 << 32) | mask1;
// mask is inverted, so each bit is 0xFF iff the corresponding byte has a bit set (and thus can be cleared)
if (mask==0) return false;
const size_t bidx = _tzcnt_u64(mask); // byte-idx of the byte in the chunk
const size_t chunk_idx = bidx / 8;
const size_t idx = (bidx % 8)*8;
mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) { // clear it atomically
*pidx = (chunk_idx*MI_BFIELD_BITS) + idx;
mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS);
return true;
}
// try again
// note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded }
}
#else
for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx)) return true;
}
return false;
#endif
}
static inline bool mi_bchunk_try_find_and_clear_8(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
mi_assert_internal(n==8); MI_UNUSED(n);
return mi_bchunk_try_find_and_clear8(chunk, pidx);
}
// find a sequence of `n` bits in a chunk with `0 < n <= MI_BFIELD_BITS` with all bits set,
// and try to clear them atomically.
// set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success.
// will cross bfield boundaries.
mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
if (n == 0 || n > MI_BFIELD_BITS) return false;
const mi_bfield_t mask = mi_bfield_mask(n, 0);
// for all fields in the chunk
for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
mi_bfield_t b0 = mi_atomic_load_relaxed(&chunk->bfields[i]);
mi_bfield_t b = b0;
size_t idx;
// is there a range inside the field?
while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit
if (idx + n > MI_BFIELD_BITS) break; // too short: maybe cross over, or continue with the next field
const size_t bmask = mask<<idx;
mi_assert_internal(bmask>>idx == mask);
if ((b&bmask) == bmask) { // found a match with all bits set, try clearing atomically
if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[i], bmask, b0, NULL)) {
*pidx = (i*MI_BFIELD_BITS) + idx;
mi_assert_internal(*pidx < MI_BCHUNK_BITS);
mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS);
return true;
}
else {
// if we failed to atomically commit, reload b and try again from the start
b = b0 = mi_atomic_load_acquire(&chunk->bfields[i]);
}
}
else {
// advance by clearing the least run of ones, for example, with n>=4, idx=2:
// b = 1111 1101 1010 1100
// .. + (1<<idx) = 1111 1101 1011 0000
// .. & b = 1111 1101 1010 0000
b = b & (b + (mi_bfield_one() << idx));
}
}
// check if we can cross into the next bfield
if (b!=0 && i < MI_BCHUNK_FIELDS-1) {
const size_t post = mi_bfield_clz(~b);
if (post > 0) {
const size_t pre = mi_bfield_ctz(~mi_atomic_load_relaxed(&chunk->bfields[i+1]));
if (post + pre >= n) {
// it fits -- try to claim it atomically
const size_t cidx = (i*MI_BFIELD_BITS) + (MI_BFIELD_BITS - post);
if (mi_bchunk_try_clearNX(chunk, cidx, n, NULL)) {
// we cleared all atomically
*pidx = cidx;
mi_assert_internal(*pidx < MI_BCHUNK_BITS);
mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS);
return true;
}
}
}
}
}
return false;
}
// find a sequence of `n` bits in a chunk with `n <= MI_BCHUNK_BITS` with all bits set,
// and try to clear them atomically.
// set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success.
// This can cross bfield boundaries.
static mi_decl_noinline bool mi_bchunk_try_find_and_clearNC(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk
// we first scan ahead to see if there is a range of `n` set bits, and only then try to clear atomically
mi_assert_internal(n>0);
const size_t skip_count = (n-1)/MI_BFIELD_BITS;
size_t cidx;
for (size_t i = 0; i < MI_BCHUNK_FIELDS - skip_count; i++)
{
size_t m = n; // bits to go
// first field
mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]);
size_t ones = mi_bfield_clz(~b);
cidx = (i*MI_BFIELD_BITS) + (MI_BFIELD_BITS - ones); // start index
if (ones >= m) {
// we found enough bits already!
m = 0;
}
else if (ones > 0) {
// keep scanning further fields until we have enough bits
m -= ones;
size_t j = 1; // field count from i
while (i+j < MI_BCHUNK_FIELDS) {
mi_assert_internal(m > 0);
b = mi_atomic_load_relaxed(&chunk->bfields[i+j]);
ones = mi_bfield_ctz(~b);
if (ones >= m) {
// we found enough bits
m = 0;
break;
}
else if (ones == MI_BFIELD_BITS) {
// not enough yet, proceed to the next field
j++;
m -= MI_BFIELD_BITS;
}
else {
// the range was not enough, start from scratch
i = i + j - 1; // no need to re-scan previous fields, except the last one (with clz this time)
mi_assert_internal(m>0);
break;
}
}
}
// did we find a range?
if (m==0) {
if (mi_bchunk_try_clearN(chunk, cidx, n, NULL)) {
// we cleared all atomically
*pidx = cidx;
mi_assert_internal(*pidx < MI_BCHUNK_BITS);
mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS);
return true;
}
// note: if we fail for a small `n` on the first field, we don't rescan that field (as `i` is incremented)
}
// otherwise continue searching
}
return false;
}
// ------- mi_bchunk_clear_once_set ---------------------------------------
static inline void mi_bchunk_clear_once_set(mi_bchunk_t* chunk, size_t cidx) {
mi_assert_internal(cidx < MI_BCHUNK_BITS);
const size_t i = cidx / MI_BFIELD_BITS;
const size_t idx = cidx % MI_BFIELD_BITS;
mi_bfield_atomic_clear_once_set(&chunk->bfields[i], idx);
}
// ------- mi_bitmap_all_are_clear ---------------------------------------
// are all bits in a bitmap chunk clear?
static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) {
#if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256)
const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
return mi_mm256_is_zero(vec);
#elif MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
// a 64b cache-line contains the entire chunk anyway so load both at once
const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1);
return (mi_mm256_is_zero(_mm256_or_si256(vec1,vec2)));
#elif MI_OPT_SIMD && (MI_BCHUNK_BITS==512) && MI_ARCH_ARM64
const uint64x2_t v0 = vld1q_u64((uint64_t*)chunk->bfields);
const uint64x2_t v1 = vld1q_u64((uint64_t*)chunk->bfields + 2);
const uint64x2_t v2 = vld1q_u64((uint64_t*)chunk->bfields + 4);
const uint64x2_t v3 = vld1q_u64((uint64_t*)chunk->bfields + 6);
const uint64x2_t v = vorrq_u64(vorrq_u64(v0,v1),vorrq_u64(v2,v3));
return (vmaxvq_u32(vreinterpretq_u32_u64(v)) == 0);
#else
for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
if (mi_atomic_load_relaxed(&chunk->bfields[i]) != 0) return false;
}
return true;
#endif
}
// are all bits in a bitmap chunk set?
static inline bool mi_bchunk_all_are_set_relaxed(mi_bchunk_t* chunk) {
#if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256)
const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
return mi_mm256_is_ones(vec);
#elif MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
// a 64b cache-line contains the entire chunk anyway so load both at once
const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1);
return (mi_mm256_is_ones(_mm256_and_si256(vec1, vec2)));
#elif MI_OPT_SIMD && (MI_BCHUNK_BITS==512) && MI_ARCH_ARM64
const uint64x2_t v0 = vld1q_u64((uint64_t*)chunk->bfields);
const uint64x2_t v1 = vld1q_u64((uint64_t*)chunk->bfields + 2);
const uint64x2_t v2 = vld1q_u64((uint64_t*)chunk->bfields + 4);
const uint64x2_t v3 = vld1q_u64((uint64_t*)chunk->bfields + 6);
const uint64x2_t v = vandq_u64(vandq_u64(v0,v1),vandq_u64(v2,v3));
return (vminvq_u32(vreinterpretq_u32_u64(v)) == 0xFFFFFFFFUL);
#else
for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
if (~mi_atomic_load_relaxed(&chunk->bfields[i]) != 0) return false;
}
return true;
#endif
}
static bool mi_bchunk_bsr(mi_bchunk_t* chunk, size_t* pidx) {
for (size_t i = MI_BCHUNK_FIELDS; i > 0; ) {
i--;
mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]);
size_t idx;
if (mi_bsr(b, &idx)) {
*pidx = (i*MI_BFIELD_BITS) + idx;
return true;
}
}
return false;
}
static bool mi_bchunk_bsr_inv(mi_bchunk_t* chunk, size_t* pidx) {
for (size_t i = MI_BCHUNK_FIELDS; i > 0; ) {
i--;
mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]);
size_t idx;
if (mi_bsr(~b, &idx)) {
*pidx = (i*MI_BFIELD_BITS) + idx;
return true;
}
}
return false;
}
static size_t mi_bchunk_popcount(mi_bchunk_t* chunk) {
size_t popcount = 0;
for (size_t i = 0; i < MI_BCHUNK_FIELDS; i++) {
const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]);
popcount += mi_bfield_popcount(b);
}
return popcount;
}
/* --------------------------------------------------------------------------------
bitmap chunkmap
-------------------------------------------------------------------------------- */
static void mi_bitmap_chunkmap_set(mi_bitmap_t* bitmap, size_t chunk_idx) {
mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap));
mi_bchunk_set(&bitmap->chunkmap, chunk_idx, NULL);
}
static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) {
mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap));
// check if the corresponding chunk is all clear
if (!mi_bchunk_all_are_clear_relaxed(&bitmap->chunks[chunk_idx])) return false;
// clear the chunkmap bit
mi_bchunk_clear(&bitmap->chunkmap, chunk_idx, NULL);
// .. but a concurrent set may have happened in between our all-clear test and the clearing of the
// bit in the mask. We check again to catch this situation.
if (!mi_bchunk_all_are_clear_relaxed(&bitmap->chunks[chunk_idx])) {
mi_bchunk_set(&bitmap->chunkmap, chunk_idx, NULL);
return false;
}
return true;
}
/* --------------------------------------------------------------------------------
bitmap
-------------------------------------------------------------------------------- */
size_t mi_bitmap_size(size_t bit_count, size_t* pchunk_count) {
mi_assert_internal((bit_count % MI_BCHUNK_BITS) == 0);
bit_count = _mi_align_up(bit_count, MI_BCHUNK_BITS);
mi_assert_internal(bit_count <= MI_BITMAP_MAX_BIT_COUNT);
mi_assert_internal(bit_count > 0);
const size_t chunk_count = bit_count / MI_BCHUNK_BITS;
mi_assert_internal(chunk_count >= 1);
const size_t size = offsetof(mi_bitmap_t,chunks) + (chunk_count * MI_BCHUNK_SIZE);
mi_assert_internal( (size%MI_BCHUNK_SIZE) == 0 );
if (pchunk_count != NULL) { *pchunk_count = chunk_count; }
return size;
}
// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true
// returns the size of the bitmap
size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero) {
size_t chunk_count;
const size_t size = mi_bitmap_size(bit_count, &chunk_count);
if (!already_zero) {
_mi_memzero_aligned(bitmap, size);
}
mi_atomic_store_release(&bitmap->chunk_count, chunk_count);
mi_assert_internal(mi_atomic_load_relaxed(&bitmap->chunk_count) <= MI_BITMAP_MAX_CHUNK_COUNT);
return size;
}
// Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread.
static void mi_bchunks_unsafe_setN(mi_bchunk_t* chunks, mi_bchunkmap_t* cmap, size_t idx, size_t n) {
mi_assert_internal(n>0);
// start chunk and index
size_t chunk_idx = idx / MI_BCHUNK_BITS;
const size_t cidx = idx % MI_BCHUNK_BITS;
const size_t ccount = _mi_divide_up(n, MI_BCHUNK_BITS);
// first update the chunkmap
mi_bchunk_setN(cmap, chunk_idx, ccount, NULL);
// first chunk
size_t m = MI_BCHUNK_BITS - cidx;
if (m > n) { m = n; }
mi_bchunk_setN(&chunks[chunk_idx], cidx, m, NULL);
// n can be large so use memset for efficiency for all in-between chunks
chunk_idx++;
n -= m;
const size_t mid_chunks = n / MI_BCHUNK_BITS;
if (mid_chunks > 0) {
_mi_memset(&chunks[chunk_idx], ~0, mid_chunks * MI_BCHUNK_SIZE);
chunk_idx += mid_chunks;
n -= (mid_chunks * MI_BCHUNK_BITS);
}
// last chunk
if (n > 0) {
mi_assert_internal(n < MI_BCHUNK_BITS);
mi_bchunk_setN(&chunks[chunk_idx], 0, n, NULL);
}
}
// Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread.
void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) {
mi_assert_internal(n>0);
mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap));
mi_bchunks_unsafe_setN(&bitmap->chunks[0], &bitmap->chunkmap, idx, n);
}
// ------- mi_bitmap_xset ---------------------------------------
// Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's).
bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* palready_set) {
mi_assert_internal(n>0);
const size_t maxbits = mi_bitmap_max_bits(bitmap);
mi_assert_internal(idx + n <= maxbits);
if (idx+n > maxbits) { // paranoia
if (idx >= maxbits) return false;
n = maxbits - idx;
}
// iterate through the chunks
size_t chunk_idx = idx / MI_BCHUNK_BITS;
size_t cidx = idx % MI_BCHUNK_BITS;
bool were_allclear = true;
size_t already_set = 0;
while (n > 0) {
const size_t m = (cidx + n > MI_BCHUNK_BITS ? MI_BCHUNK_BITS - cidx : n);
size_t _already_set = 0;
were_allclear = mi_bchunk_setN(&bitmap->chunks[chunk_idx], cidx, m, &_already_set) && were_allclear;
already_set += _already_set;
mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards
mi_assert_internal(m <= n);
n -= m;
cidx = 0;
chunk_idx++;
}
if (palready_set != NULL) { *palready_set = already_set; }
return were_allclear;
}
// Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 1's to 0's.
bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) {
mi_assert_internal(n>0);
const size_t maxbits = mi_bitmap_max_bits(bitmap);
mi_assert_internal(idx + n <= maxbits);
if (idx+n > maxbits) { // paranoia
if (idx >= maxbits) return false;
n = maxbits - idx;
}
// iterate through the chunks
size_t chunk_idx = idx / MI_BCHUNK_BITS;
size_t cidx = idx % MI_BCHUNK_BITS;
bool were_allset = true;
while (n > 0) {
const size_t m = (cidx + n > MI_BCHUNK_BITS ? MI_BCHUNK_BITS - cidx : n);
bool maybe_all_clear = false;
were_allset = mi_bchunk_clearN(&bitmap->chunks[chunk_idx], cidx, m, &maybe_all_clear) && were_allset;
if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); }
mi_assert_internal(m <= n);
n -= m;
cidx = 0;
chunk_idx++;
}
return were_allset;
}
// Count bits set in a range of `n` bits.
size_t mi_bitmap_popcountN( mi_bitmap_t* bitmap, size_t idx, size_t n) {
mi_assert_internal(n>0);
const size_t maxbits = mi_bitmap_max_bits(bitmap);
mi_assert_internal(idx + n <= maxbits);
if (idx+n > maxbits) { // paranoia
if (idx >= maxbits) return 0;
n = maxbits - idx;
}
// iterate through the chunks
size_t chunk_idx = idx / MI_BCHUNK_BITS;
size_t cidx = idx % MI_BCHUNK_BITS;
size_t popcount = 0;
while (n > 0) {
const size_t m = (cidx + n > MI_BCHUNK_BITS ? MI_BCHUNK_BITS - cidx : n);
popcount += mi_bchunk_popcountN(&bitmap->chunks[chunk_idx], cidx, m);
mi_assert_internal(m <= n);
n -= m;
cidx = 0;
chunk_idx++;
}
return popcount;
}
// Set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0)
bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx) {
return mi_bitmap_setN(bitmap, idx, 1, NULL);
}
bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx) {
return mi_bitmap_clearN(bitmap, idx, 1);
}
// ------- mi_bitmap_is_xset ---------------------------------------
// Is a sequence of n bits already all set/cleared?
bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) {
mi_assert_internal(n>0);
const size_t maxbits = mi_bitmap_max_bits(bitmap);
mi_assert_internal(idx + n <= maxbits);
if (idx+n > maxbits) { // paranoia
if (idx >= maxbits) return false;
n = maxbits - idx;
}
// iterate through the chunks
size_t chunk_idx = idx / MI_BCHUNK_BITS;
size_t cidx = idx % MI_BCHUNK_BITS;
bool xset = true;
while (n > 0 && xset) {
const size_t m = (cidx + n > MI_BCHUNK_BITS ? MI_BCHUNK_BITS - cidx : n);
xset = mi_bchunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, m) && xset;
mi_assert_internal(m <= n);
n -= m;
cidx = 0;
chunk_idx++;
}
return xset;
}
bool mi_bitmap_is_all_clear(mi_bitmap_t* bitmap) {
return mi_bitmap_is_xsetN(MI_BIT_CLEAR, bitmap, 0, mi_bitmap_max_bits(bitmap));
}
/* --------------------------------------------------------------------------------
Iterate through a bfield
-------------------------------------------------------------------------------- */
// Cycle iteration through a bitfield. This is used to space out threads
// so there is less chance of contention. When searching for a free page we
// like to first search only the accessed part (so we reuse better). This
// high point is called the `cycle`.
//
// We then iterate through the bitfield as:
// first: [start, cycle>
// then : [0, start>
// then : [cycle, MI_BFIELD_BITS>
//
// The start is determined usually as `tseq % cycle` to have each thread
// start at a different spot.
// - We use `popcount` to improve branch prediction (maybe not needed? can we simplify?)
// - The `cycle_mask` is the part `[start, cycle>`.
#define mi_bfield_iterate(bfield,start,cycle,name_idx,SUF) { \
mi_assert_internal(start <= cycle); \
mi_assert_internal(start < MI_BFIELD_BITS); \
mi_assert_internal(cycle <= MI_BFIELD_BITS); \
const mi_bfield_t _cycle_mask##SUF = mi_bfield_mask(cycle - start, start); \
size_t _bcount##SUF = mi_bfield_popcount(bfield); \
mi_bfield_t _b##SUF = bfield & _cycle_mask##SUF; /* process [start, cycle> first*/\
while(_bcount##SUF > 0) { \
_bcount##SUF--;\
if (_b##SUF==0) { _b##SUF = bfield & ~_cycle_mask##SUF; } /* process [0,start> + [cycle, MI_BFIELD_BITS> next */ \
/* size_t name_idx; */ \
const bool _found##SUF = mi_bfield_find_least_bit(_b##SUF,&name_idx); \
_b##SUF = mi_bfield_clear_least_bit(_b##SUF); /* clear early so `continue` works */ \
mi_assert_internal(_found##SUF); MI_UNUSED(_found##SUF); \
{ \
#define mi_bfield_iterate_end(SUF) \
} \
} \
}
#define mi_bfield_cycle_iterate(bfield,tseq,cycle,name_idx,SUF) { \
const size_t _start##SUF = (uint32_t)(tseq) % (uint32_t)(cycle); /* or: 0 to always search from the start? */\
mi_bfield_iterate(bfield,_start##SUF,cycle,name_idx,SUF)
#define mi_bfield_cycle_iterate_end(SUF) \
mi_bfield_iterate_end(SUF); \
}
/* --------------------------------------------------------------------------------
mi_bitmap_find
(used to find free pages)
-------------------------------------------------------------------------------- */
typedef bool (mi_bitmap_visit_fun_t)(mi_bitmap_t* bitmap, size_t chunk_idx, size_t n, size_t* idx, void* arg1, void* arg2);
// Go through the bitmap and for every sequence of `n` set bits, call the visitor function.
// If it returns `true` stop the search.
static inline bool mi_bitmap_find(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx, mi_bitmap_visit_fun_t* on_find, void* arg1, void* arg2)
{
const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS);
for (size_t i = 0; i < chunkmap_max; i++) {
// and for each chunkmap entry we iterate over its bits to find the chunks
const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]);
size_t hi;
if (mi_bfield_find_highest_bit(cmap_entry, &hi)) {
size_t eidx = 0;
mi_bfield_cycle_iterate(cmap_entry, tseq%8, hi+1, eidx, Y) // reduce the tseq to 8 bins to reduce using extra memory (see `mstress`)
{
mi_assert_internal(eidx <= MI_BFIELD_BITS);
const size_t chunk_idx = i*MI_BFIELD_BITS + eidx;
mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap));
if ((*on_find)(bitmap, chunk_idx, n, pidx, arg1, arg2)) {
return true;
}
}
mi_bfield_cycle_iterate_end(Y);
}
}
return false;
}
/* --------------------------------------------------------------------------------
Bitmap: try_find_and_claim -- used to allocate abandoned pages
note: the compiler will fully inline the indirect function call
-------------------------------------------------------------------------------- */
typedef struct mi_claim_fun_data_s {
mi_arena_t* arena;
} mi_claim_fun_data_t;
static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk_idx, size_t n, size_t* pidx, void* arg1, void* arg2)
{
mi_assert_internal(n==1); MI_UNUSED(n);
mi_claim_fun_t* claim_fun = (mi_claim_fun_t*)arg1;
mi_claim_fun_data_t* claim_data = (mi_claim_fun_data_t*)arg2;
size_t cidx;
if mi_likely(mi_bchunk_try_find_and_clear(&bitmap->chunks[chunk_idx], &cidx)) {
const size_t slice_index = (chunk_idx * MI_BCHUNK_BITS) + cidx;
mi_assert_internal(slice_index < mi_bitmap_max_bits(bitmap));
bool keep_set = true;
if ((*claim_fun)(slice_index, claim_data->arena, &keep_set)) {
// success!
mi_assert_internal(!keep_set);
*pidx = slice_index;
return true;
}
else {
// failed to claim it, set abandoned mapping again (unless the page was freed)
if (keep_set) {
const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx, NULL);
mi_assert_internal(wasclear); MI_UNUSED(wasclear);
}
}
}
else {
// we may find that all are cleared only on a second iteration but that is ok as
// the chunkmap is a conservative approximation.
mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx);
}
return false;
}
// Find a set bit in the bitmap and try to atomically clear it and claim it.
// (Used to find pages in the pages_abandoned bitmaps.)
mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx,
mi_claim_fun_t* claim, mi_arena_t* arena )
{
mi_claim_fun_data_t claim_data = { arena };
return mi_bitmap_find(bitmap, tseq, 1, pidx, &mi_bitmap_try_find_and_claim_visit, (void*)claim, &claim_data);
}
bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx) {
const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS);
for (size_t i = chunkmap_max; i > 0; ) {
i--;
mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]);
size_t cmap_idx;
if (mi_bsr(cmap,&cmap_idx)) {
// highest chunk
const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx;
size_t cidx;
if (mi_bchunk_bsr(&bitmap->chunks[chunk_idx], &cidx)) {
*idx = (chunk_idx * MI_BCHUNK_BITS) + cidx;
return true;
}
}
}
return false;
}
// Return count of all set bits in a bitmap.
size_t mi_bitmap_popcount(mi_bitmap_t* bitmap) {
// for all chunkmap entries
size_t popcount = 0;
const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS);
for (size_t i = 0; i < chunkmap_max; i++) {
mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]);
size_t cmap_idx;
// for each chunk (corresponding to a set bit in a chunkmap entry)
while (mi_bfield_foreach_bit(&cmap_entry, &cmap_idx)) {
const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx;
// count bits in a chunk
popcount += mi_bchunk_popcount(&bitmap->chunks[chunk_idx]);
}
}
return popcount;
}
// Clear a bit once it is set.
void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx) {
mi_assert_internal(idx < mi_bitmap_max_bits(bitmap));
const size_t chunk_idx = idx / MI_BCHUNK_BITS;
const size_t cidx = idx % MI_BCHUNK_BITS;
mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap));
mi_bchunk_clear_once_set(&bitmap->chunks[chunk_idx], cidx);
}
// Visit all set bits in a bitmap.
// todo: optimize further? maybe use avx512 to directly get all indices using a mask_compressstore?
bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg) {
// for all chunkmap entries
const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS);
for(size_t i = 0; i < chunkmap_max; i++) {
mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]);
size_t cmap_idx;
// for each chunk (corresponding to a set bit in a chunkmap entry)
while (mi_bfield_foreach_bit(&cmap_entry, &cmap_idx)) {
const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx;
// for each chunk field
mi_bchunk_t* const chunk = &bitmap->chunks[chunk_idx];
for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) {
const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS);
mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[j]);
size_t bidx;
while (mi_bfield_foreach_bit(&b, &bidx)) {
const size_t idx = base_idx + bidx;
if (!visit(idx, 1, arena, arg)) return false;
}
}
}
}
return true;
}
// Visit all set bits in a bitmap but try to return ranges (within bfields) if possible.
// Also clear those ranges atomically.
// Used by purging to purge larger ranges when possible
// todo: optimize further? maybe use avx512 to directly get all indices using a mask_compressstore?
bool _mi_bitmap_forall_setc_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg) {
// for all chunkmap entries
const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS);
for (size_t i = 0; i < chunkmap_max; i++) {
mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]);
size_t cmap_idx;
// for each chunk (corresponding to a set bit in a chunkmap entry)
while (mi_bfield_foreach_bit(&cmap_entry, &cmap_idx)) {
const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx;
// for each chunk field
mi_bchunk_t* const chunk = &bitmap->chunks[chunk_idx];
for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) {
const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS);
mi_bfield_t b = mi_atomic_exchange_relaxed(&chunk->bfields[j], (mi_bfield_t)0);
#if MI_DEBUG > 1
const size_t bpopcount = mi_popcount(b);
size_t rngcount = 0;
#endif
size_t bidx;
while (mi_bfield_find_least_bit(b, &bidx)) {
size_t rng = mi_ctz(~(b>>bidx)); // all the set bits from bidx
#if MI_DEBUG > 1
rngcount += rng;
#endif
const size_t idx = base_idx + bidx;
mi_assert_internal(rng>=1 && rng<=MI_BFIELD_BITS);
mi_assert_internal((idx % MI_BFIELD_BITS) + rng <= MI_BFIELD_BITS);
mi_assert_internal((idx / MI_BCHUNK_BITS) < mi_bitmap_chunk_count(bitmap));
if (!visit(idx, rng, arena, arg)) {
// break early: reset the non-visited bits
if (b!=0) {
mi_atomic_or_relaxed(&chunk->bfields[j], b);
}
return false;
}
// clear rng bits in b
b = b & ~mi_bfield_mask(rng, bidx);
}
mi_assert_internal(rngcount == bpopcount);
}
}
}
return true;
}
// Visit all set bits in a bitmap but try to return ranges (within bfields) if possible,
// but only in chunks of at least `rngslices` slices (that are also aligned at `rngslices`)
// and clear those ranges atomically.
// However, the `rngslices` are capped at `MI_BFIELD_BITS` at most.
// Used by purging to purge larger ranges when possible. With transparent huge pages we only
// want to purge whole huge pages (2 MiB) at a time which is what the `rngslices` parameter achieves.
bool _mi_bitmap_forall_setc_rangesn(mi_bitmap_t* bitmap, size_t rngslices, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg)
{
// use the generic routine for `rngslices<=1` (as that one finds longest ranges at a time)
if (rngslices<=1) {
return _mi_bitmap_forall_setc_ranges(bitmap, visit, arena, arg);
}
// mi_assert_internal(rngslices <= MI_BFIELD_BITS);
if (rngslices > MI_BFIELD_BITS) { rngslices = MI_BFIELD_BITS; } // cap at MI_BFIELD_BITS at most
// for all chunkmap entries
const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS);
for (size_t i = 0; i < chunkmap_max; i++) {
mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]);
size_t cmap_idx;
// for each chunk (corresponding to a set bit in a chunkmap entry)
while (mi_bfield_foreach_bit(&cmap_entry, &cmap_idx)) {
const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx;
// for each chunk field
mi_bchunk_t* const chunk = &bitmap->chunks[chunk_idx];
for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) {
const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS);
mi_bfield_t b = mi_atomic_exchange_relaxed(&chunk->bfields[j], (mi_bfield_t)0); // atomic clear
mi_bfield_t skipped = 0; // but track which bits we skip so we can restore them
for(size_t shift = 0; rngslices + shift <= MI_BFIELD_BITS; shift += rngslices) { // per `rngslices` to keep alignment
const mi_bfield_t rngmask = mi_bfield_mask(rngslices, shift);
if ((b & rngmask) == rngmask) {
const size_t idx = base_idx + shift;
if (!visit(idx, rngslices, arena, arg)) {
// break early: restore non-visited entries
mi_bfield_t notyet_visited = 0;
if (shift + rngslices < MI_BFIELD_BITS) {
notyet_visited = (b & (~(mi_bfield_t)0 << (shift + rngslices)));
}
mi_assert_internal((notyet_visited & skipped) == 0);
if ((notyet_visited | skipped) != 0) {
mi_atomic_or_relaxed(&chunk->bfields[j], notyet_visited | skipped);
}
return false;
}
}
else {
skipped = skipped | (b & rngmask);
}
}
if (skipped != 0) {
mi_atomic_or_relaxed(&chunk->bfields[j], skipped);
}
}
}
}
return true;
}
/* --------------------------------------------------------------------------------
binned bitmap's
-------------------------------------------------------------------------------- */
size_t mi_bbitmap_size(size_t bit_count, size_t* pchunk_count) {
// mi_assert_internal((bit_count % MI_BCHUNK_BITS) == 0);
bit_count = _mi_align_up(bit_count, MI_BCHUNK_BITS);
mi_assert_internal(bit_count <= MI_BITMAP_MAX_BIT_COUNT);
mi_assert_internal(bit_count > 0);
const size_t chunk_count = bit_count / MI_BCHUNK_BITS;
mi_assert_internal(chunk_count >= 1);
const size_t size = offsetof(mi_bbitmap_t,chunks) + (chunk_count * MI_BCHUNK_SIZE);
mi_assert_internal( (size%MI_BCHUNK_SIZE) == 0 );
if (pchunk_count != NULL) { *pchunk_count = chunk_count; }
return size;
}
// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true
// returns the size of the bitmap
size_t mi_bbitmap_init(mi_bbitmap_t* bbitmap, size_t bit_count, bool already_zero) {
size_t chunk_count;
const size_t size = mi_bbitmap_size(bit_count, &chunk_count);
if (!already_zero) {
_mi_memzero_aligned(bbitmap, size);
}
mi_atomic_store_release(&bbitmap->chunk_count, chunk_count);
mi_assert_internal(mi_atomic_load_relaxed(&bbitmap->chunk_count) <= MI_BITMAP_MAX_CHUNK_COUNT);
return size;
}
void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) {
mi_assert_internal(n>0);
mi_assert_internal(idx + n <= mi_bbitmap_max_bits(bbitmap));
mi_bchunks_unsafe_setN(&bbitmap->chunks[0], &bbitmap->chunkmap, idx, n);
}
bool mi_bbitmap_bsr_inv(mi_bbitmap_t* bbitmap, size_t* idx) {
const size_t chunk_count = mi_bbitmap_chunk_count(bbitmap);
const size_t chunkmap_max = _mi_divide_up(chunk_count, MI_BFIELD_BITS);
size_t skip_at_top = chunk_count % MI_BFIELD_BITS;
for (size_t i = chunkmap_max; i > 0; ) {
i--;
mi_bfield_t cmap = mi_atomic_load_relaxed(&bbitmap->chunkmap.bfields[i]);
size_t cmap_idx;
// don't consider top 0 bits; set those to 1 here
if (skip_at_top > 0) {
const size_t mask_top = (~mi_bfield_zero()) << (MI_BFIELD_BITS - skip_at_top);
skip_at_top = 0; // only for the first iteration
cmap |= mask_top;
}
if (mi_bsr(~cmap, &cmap_idx)) {
// highest chunk
const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx;
size_t cidx;
if (mi_bchunk_bsr_inv(&bbitmap->chunks[chunk_idx], &cidx)) {
*idx = (chunk_idx * MI_BCHUNK_BITS) + cidx;
return true;
}
}
}
return false;
}
/* --------------------------------------------------------------------------------
binned bitmap used to track free slices
-------------------------------------------------------------------------------- */
// Assign a specific size bin to a chunk
static void mi_bbitmap_set_chunk_bin(mi_bbitmap_t* bbitmap, size_t chunk_idx, mi_chunkbin_t bin) {
mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap));
for (mi_chunkbin_t ibin = MI_CBIN_SMALL; ibin < MI_CBIN_NONE; ibin = mi_chunkbin_inc(ibin)) {
if (ibin == bin) {
const bool was_clear = mi_bchunk_set(& bbitmap->chunkmap_bins[ibin], chunk_idx, NULL);
if (was_clear) { mi_os_stat_increase(chunk_bins[ibin],1); }
}
else {
const bool was_set = mi_bchunk_clear(&bbitmap->chunkmap_bins[ibin], chunk_idx, NULL);
if (was_set) { mi_os_stat_decrease(chunk_bins[ibin],1); }
}
}
}
mi_chunkbin_t mi_bbitmap_debug_get_bin(const mi_bchunkmap_t* chunkmap_bins, size_t chunk_idx) {
for (mi_chunkbin_t ibin = MI_CBIN_SMALL; ibin < MI_CBIN_NONE; ibin = mi_chunkbin_inc(ibin)) {
if (mi_bchunk_is_xsetN(MI_BIT_SET, &chunkmap_bins[ibin], chunk_idx, 1)) {
return ibin;
}
}
return MI_CBIN_NONE;
}
// Track the index of the highest chunk that is accessed.
static void mi_bbitmap_chunkmap_set_max(mi_bbitmap_t* bbitmap, size_t chunk_idx) {
size_t oldmax = mi_atomic_load_relaxed(&bbitmap->chunk_max_accessed);
if mi_unlikely(chunk_idx > oldmax) {
mi_atomic_cas_strong_relaxed(&bbitmap->chunk_max_accessed, &oldmax, chunk_idx);
}
}
// Set a bit in the chunkmap
static void mi_bbitmap_chunkmap_set(mi_bbitmap_t* bbitmap, size_t chunk_idx, bool check_all_set) {
mi_assert(chunk_idx < mi_bbitmap_chunk_count(bbitmap));
if (check_all_set) {
if (mi_bchunk_all_are_set_relaxed(&bbitmap->chunks[chunk_idx])) {
// all slices are free in this chunk: return back to the NONE bin
mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx, MI_CBIN_NONE);
}
}
mi_bchunk_set(&bbitmap->chunkmap, chunk_idx, NULL);
mi_bbitmap_chunkmap_set_max(bbitmap, chunk_idx);
}
static bool mi_bbitmap_chunkmap_try_clear(mi_bbitmap_t* bbitmap, size_t chunk_idx) {
mi_assert(chunk_idx < mi_bbitmap_chunk_count(bbitmap));
// check if the corresponding chunk is all clear
if (!mi_bchunk_all_are_clear_relaxed(&bbitmap->chunks[chunk_idx])) return false;
// clear the chunkmap bit
mi_bchunk_clear(&bbitmap->chunkmap, chunk_idx, NULL);
// .. but a concurrent set may have happened in between our all-clear test and the clearing of the
// bit in the mask. We check again to catch this situation. (note: mi_bchunk_clear must be acq-rel)
if (!mi_bchunk_all_are_clear_relaxed(&bbitmap->chunks[chunk_idx])) {
mi_bchunk_set(&bbitmap->chunkmap, chunk_idx, NULL);
return false;
}
mi_bbitmap_chunkmap_set_max(bbitmap, chunk_idx);
return true;
}
/* --------------------------------------------------------------------------------
mi_bbitmap_setN, try_clearN, and is_xsetN
(used to find free pages)
-------------------------------------------------------------------------------- */
// Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's).
bool mi_bbitmap_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) {
mi_assert_internal(n>0);
const size_t maxbits = mi_bbitmap_max_bits(bbitmap);
mi_assert_internal(idx + n <= maxbits);
if (idx+n > maxbits) { // paranoia
if (idx >= maxbits) return false;
n = maxbits - idx;
}
// iterate through the chunks
size_t chunk_idx = idx / MI_BCHUNK_BITS;
size_t cidx = idx % MI_BCHUNK_BITS;
bool were_allclear = true;
while (n > 0) {
const size_t m = (cidx + n > MI_BCHUNK_BITS ? MI_BCHUNK_BITS - cidx : n);
were_allclear = mi_bchunk_setN(&bbitmap->chunks[chunk_idx], cidx, m, NULL) && were_allclear;
mi_bbitmap_chunkmap_set(bbitmap, chunk_idx, true); // set afterwards
mi_assert_internal(m <= n);
n -= m;
cidx = 0;
chunk_idx++;
}
return were_allclear;
}
// ------- mi_bbitmap_try_clearNC ---------------------------------------
// Try to clear `n` bits at `idx` where `n <= MI_BCHUNK_BITS`.
bool mi_bbitmap_try_clearNC(mi_bbitmap_t* bbitmap, size_t idx, size_t n) {
mi_assert_internal(n>0);
mi_assert_internal(n<=MI_BCHUNK_BITS);
mi_assert_internal(idx + n <= mi_bbitmap_max_bits(bbitmap));
const size_t chunk_idx = idx / MI_BCHUNK_BITS;
const size_t cidx = idx % MI_BCHUNK_BITS;
mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now)
mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap));
if (cidx + n > MI_BCHUNK_BITS) return false;
bool maybe_all_clear = false;
const bool cleared = mi_bchunk_try_clearN(&bbitmap->chunks[chunk_idx], cidx, n, &maybe_all_clear);
if (cleared && maybe_all_clear) { mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx); }
// note: we don't set the size class for an explicit try_clearN (only used by purging)
return cleared;
}
// ------- mi_bbitmap_is_xset ---------------------------------------
// Is a sequence of n bits already all set/cleared?
bool mi_bbitmap_is_xsetN(mi_xset_t set, mi_bbitmap_t* bbitmap, size_t idx, size_t n) {
mi_assert_internal(n>0);
const size_t maxbits = mi_bbitmap_max_bits(bbitmap);
mi_assert_internal(idx + n <= maxbits);
if (idx+n > maxbits) { // paranoia
if (idx >= maxbits) return false;
n = maxbits - idx;
}
// iterate through the chunks
size_t chunk_idx = idx / MI_BCHUNK_BITS;
size_t cidx = idx % MI_BCHUNK_BITS;
bool xset = true;
while (n > 0 && xset) {
const size_t m = (cidx + n > MI_BCHUNK_BITS ? MI_BCHUNK_BITS - cidx : n);
xset = mi_bchunk_is_xsetN(set, &bbitmap->chunks[chunk_idx], cidx, m) && xset;
mi_assert_internal(m <= n);
n -= m;
cidx = 0;
chunk_idx++;
}
return xset;
}
/* --------------------------------------------------------------------------------
mi_bbitmap_find
(used to find free pages)
-------------------------------------------------------------------------------- */
typedef bool (mi_bchunk_try_find_and_clear_fun_t)(mi_bchunk_t* chunk, size_t n, size_t* idx);
// Go through the bbitmap and for every sequence of `n` set bits, call the visitor function.
// If it returns `true` stop the search.
//
// This is used for finding free blocks and it is important to be efficient (with 2-level bitscan)
// but also reduce fragmentation (through size bins).
static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx, mi_bchunk_try_find_and_clear_fun_t* on_find)
{
// we space out threads to reduce contention
const size_t cmap_max_count = _mi_divide_up(mi_bbitmap_chunk_count(bbitmap),MI_BFIELD_BITS);
const size_t chunk_acc = mi_atomic_load_relaxed(&bbitmap->chunk_max_accessed);
const size_t cmap_acc = chunk_acc / MI_BFIELD_BITS;
const size_t cmap_acc_bits = 1 + (chunk_acc % MI_BFIELD_BITS);
// create a mask over the chunkmap entries to iterate over them efficiently
mi_assert_internal(MI_BFIELD_BITS >= MI_BCHUNK_FIELDS);
const mi_bfield_t cmap_mask = mi_bfield_mask(cmap_max_count,0);
const size_t cmap_cycle = cmap_acc+1;
const mi_chunkbin_t bbin = mi_chunkbin_of(n);
// visit each cmap entry
size_t cmap_idx = 0;
mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X)
{
// and for each chunkmap entry we iterate over its bits to find the chunks
const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bbitmap->chunkmap.bfields[cmap_idx]);
const size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? MI_BFIELD_BITS : cmap_acc_bits);
if (cmap_entry == 0) {
continue;
}
// get size bin masks
mi_bfield_t cmap_bins[MI_CBIN_COUNT] = { 0 };
cmap_bins[MI_CBIN_NONE] = cmap_entry;
for (mi_chunkbin_t ibin = MI_CBIN_SMALL; ibin < MI_CBIN_NONE; ibin = mi_chunkbin_inc(ibin)) {
const mi_bfield_t cmap_bin = mi_atomic_load_relaxed(&bbitmap->chunkmap_bins[ibin].bfields[cmap_idx]);
cmap_bins[ibin] = cmap_bin & cmap_entry;
cmap_bins[MI_CBIN_NONE] &= ~cmap_bin; // clear bits that are in an assigned size bin
}
// consider only chunks for a particular size bin at a time
// this picks the best bin only within a cmap entry (~ 1GiB address space), but avoids multiple
// iterations through all entries.
mi_assert_internal(bbin < MI_CBIN_NONE);
for (mi_chunkbin_t ibin = MI_CBIN_SMALL; ibin <= MI_CBIN_NONE;
// skip from bbin to NONE (so, say, a SMALL will never be placed in a OTHER, MEDIUM, or LARGE chunk to reduce fragmentation)
ibin = (ibin == bbin ? MI_CBIN_NONE : mi_chunkbin_inc(ibin)))
{
mi_assert_internal(ibin < MI_CBIN_COUNT);
const mi_bfield_t cmap_bin = cmap_bins[ibin];
size_t eidx = 0;
mi_bfield_cycle_iterate(cmap_bin, tseq, cmap_entry_cycle, eidx, Y)
{
// assertion doesn't quite hold as the max_accessed may be out-of-date
// mi_assert_internal(cmap_entry_cycle > eidx || ibin == MI_CBIN_NONE);
// get the chunk
const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx;
mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx];
size_t cidx;
if ((*on_find)(chunk, n, &cidx)) {
if (cidx==0 && ibin == MI_CBIN_NONE) { // only the first block determines the size bin
// this chunk is now reserved for the `bbin` size class
mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx, bbin);
}
*pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx;
mi_assert_internal(*pidx + n <= mi_bbitmap_max_bits(bbitmap));
return true;
}
else {
// todo: should _on_find_ return a boolean if there is a chance all are clear to avoid calling `try_clear?`
// we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation.
mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx);
}
}
mi_bfield_cycle_iterate_end(Y);
}
}
mi_bfield_cycle_iterate_end(X);
return false;
}
/* --------------------------------------------------------------------------------
mi_bbitmap_try_find_and_clear -- used to find free pages
note: the compiler will fully inline the indirect function calls
-------------------------------------------------------------------------------- */
bool mi_bbitmap_try_find_and_clear(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) {
return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, 1, pidx, &mi_bchunk_try_find_and_clear_1);
}
bool mi_bbitmap_try_find_and_clear8(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) {
return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, 8, pidx, &mi_bchunk_try_find_and_clear_8);
}
// bool mi_bbitmap_try_find_and_clearX(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) {
// return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, MI_BFIELD_BITS, pidx, &mi_bchunk_try_find_and_clear_X);
// }
bool mi_bbitmap_try_find_and_clearNX(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx) {
mi_assert_internal(n<=MI_BFIELD_BITS);
return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearNX);
}
bool mi_bbitmap_try_find_and_clearNC(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx) {
mi_assert_internal(n<=MI_BCHUNK_BITS);
return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearNC);
}
/* --------------------------------------------------------------------------------
mi_bbitmap_try_find_and_clear for huge objects spanning multiple chunks
-------------------------------------------------------------------------------- */
// Try to atomically clear `n` bits starting at `chunk_idx` where `n` can span over multiple chunks
static bool mi_bchunk_try_clearN_(mi_bbitmap_t* bbitmap, size_t chunk_idx, size_t n) {
mi_assert_internal((chunk_idx * MI_BCHUNK_BITS) + n <= mi_bbitmap_max_bits(bbitmap));
size_t m = n; // bits to go
size_t count = 0; // chunk count
while (m > 0) {
mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx + count];
if (!mi_bchunk_try_clearN(chunk, 0, (m > MI_BCHUNK_BITS ? MI_BCHUNK_BITS : m), NULL)) {
goto rollback;
}
m = (m <= MI_BCHUNK_BITS ? 0 : m - MI_BCHUNK_BITS);
count++;
}
return true;
rollback:
// we only need to reset chunks the we just fully cleared
while (count > 0) {
count--;
mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx + count];
mi_bchunk_setN(chunk, 0, MI_BCHUNK_BITS, NULL);
}
return false;
}
// Go through the bbitmap to find a sequence of `n` bits and clear them atomically where `n > MI_ARENA_MAX_CHUNK_OBJ_SIZE`
// Since these are very large object allocations we always search from the start and only consider starting at the start
// of a chunk (for fragmentation and efficiency).
// Todo: for now we try to find full empty chunks to cover `n` but we can allow a partial chunk at the end
// Todo: This scans directly through the chunks -- we might want to consult the cmap as well?
bool mi_bbitmap_try_find_and_clearN_(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx) {
MI_UNUSED(tseq);
mi_assert(n > 0); if (n==0) { return false; }
const size_t chunk_max = mi_bbitmap_chunk_count(bbitmap);
const size_t chunk_req = _mi_divide_up(n, MI_BCHUNK_BITS); // minimal number of chunks needed
if (chunk_max < chunk_req) { return false; }
// iterate through the chunks
size_t chunk_idx = 0;
while (chunk_idx <= chunk_max - chunk_req)
{
size_t count = 0; // chunk count
do {
mi_assert_internal(chunk_idx + count < chunk_max);
mi_bchunk_t* const chunk = &bbitmap->chunks[chunk_idx + count];
if (!mi_bchunk_all_are_set_relaxed(chunk)) {
break;
}
else {
count++;
}
}
while (count < chunk_req);
// did we find a suitable range?
if (count == chunk_req) {
// now try to claim it!
if (mi_bchunk_try_clearN_(bbitmap, chunk_idx, n)) {
*pidx = (chunk_idx * MI_BCHUNK_BITS);
for (size_t i = 0; i < count; i++) {
mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx + i, MI_CBIN_HUGE);
}
mi_assert_internal(*pidx + n <= mi_bbitmap_max_bits(bbitmap));
return true;
}
}
// keep searching but skip the scanned range
chunk_idx += count+1;
}
return false;
}
|