import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from matplotlib.gridspec import GridSpec
from matplotlib.patches import FancyArrowPatch
from scipy.stats import norm
import os
import re
def setup_figure(title, rows, cols):
"""Initializes a new figure and grid layout with constrained_layout to avoid warnings."""
fig = plt.figure(figsize=(20, 10), constrained_layout=True)
fig.suptitle(title, fontsize=18, fontweight='bold')
gs = GridSpec(rows, cols, figure=fig)
return fig, gs
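# Minimal usage sketch (illustrative panel layout, not prescriptive):
#   fig, gs = setup_figure("MDP & Environment", rows=1, cols=2)
#   plot_agent_env_loop(fig.add_subplot(gs[0, 0]))
#   plot_discount_decay(fig.add_subplot(gs[0, 1]))
#   plt.show()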
def plot_agent_env_loop(ax):
"""MDP & Environment: Agent-Environment Interaction Loop (Flowchart)."""
ax.axis('off')
ax.set_title("Agent-Environment Interaction", fontsize=12, fontweight='bold')
props = dict(boxstyle="round,pad=0.8", fc="ivory", ec="black", lw=1.5)
ax.text(0.5, 0.8, "Agent", ha="center", va="center", bbox=props, fontsize=12)
ax.text(0.5, 0.2, "Environment", ha="center", va="center", bbox=props, fontsize=12)
# Arrows
# Agent to Env: Action
ax.annotate("Action $A_t$", xy=(0.5, 0.35), xytext=(0.5, 0.65),
arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=-0.5", lw=2))
# Env to Agent: State & Reward
ax.annotate("State $S_{t+1}$, Reward $R_{t+1}$", xy=(0.5, 0.65), xytext=(0.5, 0.35),
arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=-0.5", lw=2, color='green'))
def plot_mdp_graph(ax):
"""MDP & Environment: Directed graph with probability-weighted arrows."""
G = nx.DiGraph()
# Corrected syntax: using a dictionary for edge attributes
G.add_edges_from([
('S0', 'S1', {'weight': 0.8}), ('S0', 'S2', {'weight': 0.2}),
('S1', 'S2', {'weight': 1.0}), ('S2', 'S0', {'weight': 0.5}), ('S2', 'S2', {'weight': 0.5})
])
pos = nx.spring_layout(G, seed=42)
nx.draw_networkx_nodes(ax=ax, G=G, pos=pos, node_size=1500, node_color='lightblue')
nx.draw_networkx_labels(ax=ax, G=G, pos=pos, font_weight='bold')
edge_labels = {(u, v): f"P={d['weight']}" for u, v, d in G.edges(data=True)}
nx.draw_networkx_edges(ax=ax, G=G, pos=pos, arrowsize=20, edge_color='gray', connectionstyle="arc3,rad=0.1")
nx.draw_networkx_edge_labels(ax=ax, G=G, pos=pos, edge_labels=edge_labels, font_size=9)
ax.set_title("MDP State Transition Graph", fontsize=12, fontweight='bold')
ax.axis('off')
def plot_reward_landscape(fig, gs):
"""MDP & Environment: 3D surface plot of a reward function."""
# Use the first available slot in gs (handled flexibly for dashboard vs save)
try:
ax = fig.add_subplot(gs[0, 1], projection='3d')
except IndexError:
ax = fig.add_subplot(gs[0, 0], projection='3d')
X = np.linspace(-5, 5, 50)
Y = np.linspace(-5, 5, 50)
X, Y = np.meshgrid(X, Y)
Z = np.sin(np.sqrt(X**2 + Y**2)) + (X * 0.1) # Simulated reward landscape
surf = ax.plot_surface(X, Y, Z, cmap='viridis', edgecolor='none', alpha=0.9)
ax.set_title("Reward Function Landscape", fontsize=12, fontweight='bold')
ax.set_xlabel('State X')
ax.set_ylabel('State Y')
ax.set_zlabel('Reward R(s)')
def plot_trajectory(ax):
"""MDP & Environment: Trajectory / Episode Sequence."""
ax.set_title("Trajectory Sequence", fontsize=12, fontweight='bold')
states = ['s0', 's1', 's2', 's3', 'sT']
actions = ['a0', 'a1', 'a2', 'a3']
rewards = ['r1', 'r2', 'r3', 'r4']
for i, s in enumerate(states):
ax.text(i, 0.5, s, ha='center', va='center', bbox=dict(boxstyle="circle", fc="white"))
if i < len(actions):
ax.annotate("", xy=(i+0.8, 0.5), xytext=(i+0.2, 0.5), arrowprops=dict(arrowstyle="->"))
ax.text(i+0.5, 0.6, actions[i], ha='center', color='blue')
ax.text(i+0.5, 0.4, rewards[i], ha='center', color='red')
ax.set_xlim(-0.5, len(states)-0.5)
ax.set_ylim(0, 1)
ax.axis('off')
def plot_continuous_space(ax):
"""MDP & Environment: Continuous State/Action Space Visualization."""
np.random.seed(42)
x = np.random.randn(200, 2)
labels = np.linalg.norm(x, axis=1) > 1.0
ax.scatter(x[labels, 0], x[labels, 1], c='coral', alpha=0.6, label='High Reward')
ax.scatter(x[~labels, 0], x[~labels, 1], c='skyblue', alpha=0.6, label='Low Reward')
ax.set_title("Continuous State Space (2D Projection)", fontsize=12, fontweight='bold')
ax.legend(fontsize=8)
def plot_discount_decay(ax):
"""MDP & Environment: Discount Factor (gamma) Effect."""
t = np.arange(0, 20)
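    # Each curve shows gamma**t: the weight given to a reward t steps ahead in
    # the discounted return G_t = sum_k gamma**k * R_{t+k+1}.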
for gamma in [0.5, 0.9, 0.99]:
ax.plot(t, gamma**t, marker='o', markersize=4, label=rf"$\gamma={gamma}$")
ax.set_title(r"Discount Factor $\gamma^t$ Decay", fontsize=12, fontweight='bold')
ax.set_xlabel("Time steps (t)")
ax.set_ylabel("Weight")
ax.legend()
ax.grid(True, alpha=0.3)
def plot_value_heatmap(ax):
"""Value & Policy: State-Value Function V(s) Heatmap (Gridworld)."""
grid_size = 5
# Simulate a value landscape where the top right is the goal
values = np.zeros((grid_size, grid_size))
for i in range(grid_size):
for j in range(grid_size):
values[i, j] = -( (grid_size-1-i)**2 + (grid_size-1-j)**2 ) * 0.5
values[-1, -1] = 10.0 # Goal state
cax = ax.matshow(values, cmap='magma')
for (i, j), z in np.ndenumerate(values):
ax.text(j, i, f'{z:0.1f}', ha='center', va='center', color='white' if z < -5 else 'black', fontsize=9)
ax.set_title("State-Value Function V(s) Heatmap", fontsize=12, fontweight='bold', pad=15)
ax.set_xticks(range(grid_size))
ax.set_yticks(range(grid_size))
def plot_backup_diagram(ax):
"""Dynamic Programming: Policy Evaluation Backup Diagram."""
G = nx.DiGraph()
G.add_node("s", layer=0)
G.add_node("a1", layer=1); G.add_node("a2", layer=1)
G.add_node("s'_1", layer=2); G.add_node("s'_2", layer=2); G.add_node("s'_3", layer=2)
G.add_edges_from([("s", "a1"), ("s", "a2")])
G.add_edges_from([("a1", "s'_1"), ("a1", "s'_2"), ("a2", "s'_3")])
pos = {
"s": (0.5, 1),
"a1": (0.25, 0.5), "a2": (0.75, 0.5),
"s'_1": (0.1, 0), "s'_2": (0.4, 0), "s'_3": (0.75, 0)
}
nx.draw_networkx_nodes(ax=ax, G=G, pos=pos, nodelist=["s", "s'_1", "s'_2", "s'_3"], node_size=800, node_color='white', edgecolors='black')
nx.draw_networkx_nodes(ax=ax, G=G, pos=pos, nodelist=["a1", "a2"], node_size=300, node_color='black') # Action nodes are solid black dots
nx.draw_networkx_edges(ax=ax, G=G, pos=pos, arrows=True)
nx.draw_networkx_labels(ax=ax, G=G, pos=pos, labels={"s": "s", "s'_1": "s'", "s'_2": "s'", "s'_3": "s'"}, font_size=10)
ax.set_title("DP Policy Eval Backup", fontsize=12, fontweight='bold')
ax.set_ylim(-0.2, 1.2)
ax.axis('off')
def plot_action_value_q(ax):
"""Value & Policy: Action-Value Function Q(s,a) (Heatmap per action stack)."""
grid = np.random.rand(3, 3)
ax.imshow(grid, cmap='YlGnBu')
for (i, j), z in np.ndenumerate(grid):
ax.text(j, i, f'{z:0.1f}', ha='center', va='center', fontsize=8)
ax.set_title(r"Action-Value $Q(s, a_{up})$", fontsize=12, fontweight='bold')
ax.set_xticks([]); ax.set_yticks([])
def plot_policy_arrows(ax):
"""Value & Policy: Policy π(s) as arrow overlays on grid."""
grid_size = 4
ax.set_xlim(-0.5, grid_size-0.5)
ax.set_ylim(-0.5, grid_size-0.5)
for i in range(grid_size):
for j in range(grid_size):
dx, dy = np.random.choice([0, 0.3, -0.3]), np.random.choice([0, 0.3, -0.3])
if dx == 0 and dy == 0: dx = 0.3
ax.add_patch(FancyArrowPatch((j, i), (j+dx, i+dy), arrowstyle='->', mutation_scale=15))
ax.set_title(r"Policy $\pi(s)$ Arrows", fontsize=12, fontweight='bold')
ax.set_xticks(range(grid_size)); ax.set_yticks(range(grid_size)); ax.grid(True, alpha=0.2)
def plot_advantage_function(ax):
"""Value & Policy: Advantage Function A(s,a) = Q-V."""
actions = ['A1', 'A2', 'A3', 'A4']
advantage = [2.1, -1.2, 0.5, -0.8]
colors = ['green' if v > 0 else 'red' for v in advantage]
ax.bar(actions, advantage, color=colors, alpha=0.7)
ax.axhline(0, color='black', lw=1)
ax.set_title(r"Advantage $A(s, a)$", fontsize=12, fontweight='bold')
ax.set_ylabel("Value")
def plot_policy_improvement(ax):
"""Dynamic Programming: Policy Improvement (Before vs After)."""
ax.axis('off')
ax.set_title("Policy Improvement", fontsize=12, fontweight='bold')
ax.text(0.2, 0.5, r"$\pi_{old}$", fontsize=15, bbox=dict(boxstyle="round", fc="lightgrey"))
ax.annotate("", xy=(0.8, 0.5), xytext=(0.3, 0.5), arrowprops=dict(arrowstyle="->", lw=2))
ax.text(0.5, 0.6, "Greedy\nImprovement", ha='center', fontsize=9)
ax.text(0.85, 0.5, r"$\pi_{new}$", fontsize=15, bbox=dict(boxstyle="round", fc="lightgreen"))
def plot_value_iteration_backup(ax):
"""Dynamic Programming: Value Iteration Backup Diagram (Max over actions)."""
G = nx.DiGraph()
pos = {"s": (0.5, 1), "max": (0.5, 0.5), "s1": (0.2, 0), "s2": (0.5, 0), "s3": (0.8, 0)}
G.add_nodes_from(pos.keys())
G.add_edges_from([("s", "max"), ("max", "s1"), ("max", "s2"), ("max", "s3")])
nx.draw_networkx_nodes(ax=ax, G=G, pos=pos, node_size=500, node_color='white', edgecolors='black')
nx.draw_networkx_edges(ax=ax, G=G, pos=pos, arrows=True)
nx.draw_networkx_labels(ax=ax, G=G, pos=pos, labels={"s": "s", "max": "max", "s1": "s'", "s2": "s'", "s3": "s'"}, font_size=9)
ax.set_title("Value Iteration Backup", fontsize=12, fontweight='bold')
ax.axis('off')
def plot_policy_iteration_cycle(ax):
"""Dynamic Programming: Policy Iteration Full Cycle Flowchart."""
ax.axis('off')
ax.set_title("Policy Iteration Cycle", fontsize=12, fontweight='bold')
props = dict(boxstyle="round", fc="aliceblue", ec="black")
ax.text(0.5, 0.8, r"Policy Evaluation" + "\n" + r"$V \leftarrow V^\pi$", ha="center", bbox=props)
ax.text(0.5, 0.2, r"Policy Improvement" + "\n" + r"$\pi \leftarrow \text{greedy}(V)$", ha="center", bbox=props)
ax.annotate("", xy=(0.7, 0.3), xytext=(0.7, 0.7), arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=-0.5"))
ax.annotate("", xy=(0.3, 0.7), xytext=(0.3, 0.3), arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=-0.5"))
def plot_mc_backup(ax):
"""Monte Carlo: Backup diagram (Full trajectory until terminal sT)."""
ax.axis('off')
ax.set_title("Monte Carlo Backup", fontsize=12, fontweight='bold')
nodes = ['s', 's1', 's2', 'sT']
pos = {n: (0.5, 0.9 - i*0.25) for i, n in enumerate(nodes)}
for i in range(len(nodes)-1):
ax.annotate("", xy=pos[nodes[i+1]], xytext=pos[nodes[i]], arrowprops=dict(arrowstyle="->", lw=1.5))
ax.text(pos[nodes[i]][0]+0.05, pos[nodes[i]][1], nodes[i], va='center')
ax.text(pos['sT'][0]+0.05, pos['sT'][1], 'sT', va='center', fontweight='bold')
ax.annotate("Update V(s) using G", xy=(0.3, 0.9), xytext=(0.3, 0.15), arrowprops=dict(arrowstyle="->", color='red', connectionstyle="arc3,rad=0.3"))
def plot_mcts(ax):
"""Monte Carlo: Monte Carlo Tree Search (MCTS) tree diagram."""
G = nx.balanced_tree(2, 2, create_using=nx.DiGraph())
    # Hand-placed layout for the depth-2 binary tree; avoids requiring the
    # optional pygraphviz dependency for a graphviz 'dot' layout.
    pos = {0: (0, 0), 1: (-1, -1), 2: (1, -1), 3: (-1.5, -2), 4: (-0.5, -2), 5: (0.5, -2), 6: (1.5, -2)}
nx.draw_networkx_nodes(ax=ax, G=G, pos=pos, node_size=300, node_color='lightyellow', edgecolors='black')
nx.draw_networkx_edges(ax=ax, G=G, pos=pos, arrows=True)
ax.set_title("MCTS Tree", fontsize=12, fontweight='bold')
ax.axis('off')
def plot_importance_sampling(ax):
"""Monte Carlo: Importance Sampling Ratio Flow."""
ax.axis('off')
ax.set_title("Importance Sampling", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, r"$\pi(a|s)$", bbox=dict(boxstyle="circle", fc="lightgreen"), ha='center')
ax.text(0.5, 0.2, r"$b(a|s)$", bbox=dict(boxstyle="circle", fc="lightpink"), ha='center')
ax.annotate(r"$\rho = \frac{\pi}{b}$", xy=(0.7, 0.5), fontsize=15)
ax.annotate("", xy=(0.5, 0.35), xytext=(0.5, 0.65), arrowprops=dict(arrowstyle="<->", lw=2))
def plot_td_backup(ax):
"""Temporal Difference: TD(0) 1-step backup."""
ax.axis('off')
ax.set_title("TD(0) Backup", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "s", bbox=dict(boxstyle="circle", fc="white"), ha='center')
ax.text(0.5, 0.2, "s'", bbox=dict(boxstyle="circle", fc="white"), ha='center')
ax.annotate(r"$R + \gamma V(s')$", xy=(0.5, 0.4), ha='center', color='blue')
ax.annotate("", xy=(0.5, 0.35), xytext=(0.5, 0.65), arrowprops=dict(arrowstyle="<-", lw=2))
def plot_nstep_td(ax):
"""Temporal Difference: n-step TD backup."""
ax.axis('off')
ax.set_title("n-step TD Backup", fontsize=12, fontweight='bold')
for i in range(4):
ax.text(0.5, 0.9-i*0.2, f"s_{i}", bbox=dict(boxstyle="circle", fc="white"), ha='center', fontsize=8)
if i < 3: ax.annotate("", xy=(0.5, 0.75-i*0.2), xytext=(0.5, 0.85-i*0.2), arrowprops=dict(arrowstyle="->"))
ax.annotate(r"$G_t^{(n)}$", xy=(0.7, 0.5), fontsize=12, color='red')
def plot_eligibility_traces(ax):
"""Temporal Difference: TD(lambda) Eligibility Traces decay curve."""
t = np.arange(0, 50)
# Simulate multiple highlights (visits)
trace = np.zeros_like(t, dtype=float)
visits = [5, 20, 35]
for v in visits:
trace[v:] += (0.8 ** np.arange(len(t)-v))
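    # Each visit adds 1 to the trace, which then decays geometrically
    # (0.8 per step here, standing in for gamma*lambda), so recently
    # visited states receive the most credit.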
ax.plot(t, trace, color='brown', lw=2)
ax.set_title(r"Eligibility Trace $z_t(\lambda)$", fontsize=12, fontweight='bold')
ax.set_xlabel("Time")
ax.fill_between(t, trace, color='brown', alpha=0.1)
def plot_sarsa_backup(ax):
"""Temporal Difference: SARSA (On-policy) backup."""
ax.axis('off')
ax.set_title("SARSA Backup", fontsize=12, fontweight='bold')
ax.text(0.5, 0.9, "(s,a)", ha='center')
ax.text(0.5, 0.1, "(s',a')", ha='center')
ax.annotate("", xy=(0.5, 0.2), xytext=(0.5, 0.8), arrowprops=dict(arrowstyle="<-", lw=2, color='orange'))
ax.text(0.6, 0.5, "On-policy", rotation=90)
def plot_q_learning_backup(ax):
"""Temporal Difference: Q-Learning (Off-policy) backup."""
ax.axis('off')
ax.set_title("Q-Learning Backup", fontsize=12, fontweight='bold')
ax.text(0.5, 0.9, "(s,a)", ha='center')
ax.text(0.5, 0.1, r"$\max_{a'} Q(s',a')$", ha='center', bbox=dict(boxstyle="round", fc="lightcyan"))
ax.annotate("", xy=(0.5, 0.25), xytext=(0.5, 0.8), arrowprops=dict(arrowstyle="<-", lw=2, color='blue'))
def plot_double_q(ax):
"""Temporal Difference: Double Q-Learning / Double DQN."""
ax.axis('off')
ax.set_title("Double Q-Learning", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Network A", bbox=dict(fc="lightyellow"), ha='center')
ax.text(0.5, 0.2, "Network B", bbox=dict(fc="lightcyan"), ha='center')
ax.annotate("Select $a^*$", xy=(0.3, 0.8), xytext=(0.5, 0.85), arrowprops=dict(arrowstyle="->"))
ax.annotate("Eval $Q(s', a^*)$", xy=(0.7, 0.2), xytext=(0.5, 0.15), arrowprops=dict(arrowstyle="->"))
def plot_dueling_dqn(ax):
"""Temporal Difference: Dueling DQN Architecture."""
ax.axis('off')
ax.set_title("Dueling DQN", fontsize=12, fontweight='bold')
ax.text(0.1, 0.5, "Backbone", bbox=dict(fc="lightgrey"), ha='center', rotation=90)
ax.text(0.5, 0.7, "V(s)", bbox=dict(fc="lightgreen"), ha='center')
ax.text(0.5, 0.3, "A(s,a)", bbox=dict(fc="lightblue"), ha='center')
ax.text(0.9, 0.5, "Q(s,a)", bbox=dict(boxstyle="circle", fc="orange"), ha='center')
ax.annotate("", xy=(0.35, 0.7), xytext=(0.15, 0.55), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.35, 0.3), xytext=(0.15, 0.45), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.75, 0.55), xytext=(0.6, 0.7), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.75, 0.45), xytext=(0.6, 0.3), arrowprops=dict(arrowstyle="->"))
def plot_prioritized_replay(ax):
"""Temporal Difference: Prioritized Experience Replay (PER)."""
priorities = np.random.pareto(3, 100)
ax.hist(priorities, bins=20, color='teal', alpha=0.7)
ax.set_title("Prioritized Replay (TD-Error)", fontsize=12, fontweight='bold')
ax.set_xlabel("Priority $P_i$")
ax.set_ylabel("Count")
def plot_rainbow_dqn(ax):
"""Temporal Difference: Rainbow DQN Composite."""
ax.axis('off')
ax.set_title("Rainbow DQN", fontsize=12, fontweight='bold')
features = ["Double", "Dueling", "PER", "Noisy", "Distributional", "n-step"]
for i, f in enumerate(features):
ax.text(0.5, 0.9 - i*0.15, f, ha='center', bbox=dict(boxstyle="round", fc="ghostwhite"), fontsize=8)
def plot_linear_fa(ax):
"""Function Approximation: Linear Function Approximation."""
ax.axis('off')
ax.set_title("Linear Function Approx", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, r"$\phi(s)$ Features", ha='center', bbox=dict(fc="white"))
ax.text(0.5, 0.2, r"$w^T \phi(s)$", ha='center', bbox=dict(fc="lightgrey"))
ax.annotate("", xy=(0.5, 0.35), xytext=(0.5, 0.65), arrowprops=dict(arrowstyle="->", lw=2))
def plot_nn_layers(ax):
"""Function Approximation: Neural Network Layers diagram."""
ax.axis('off')
ax.set_title("NN Layers (Deep RL)", fontsize=12, fontweight='bold')
layers = [4, 8, 8, 2]
for i, l in enumerate(layers):
for j in range(l):
ax.scatter(i*0.3, j*0.1 - l*0.05, s=20, c='black')
ax.set_xlim(-0.1, 1.0)
ax.set_ylim(-0.5, 0.5)
def plot_computation_graph(ax):
"""Function Approximation: Computation Graph / Backprop Flow."""
ax.axis('off')
ax.set_title("Computation Graph (DAG)", fontsize=12, fontweight='bold')
ax.text(0.1, 0.5, "Input", bbox=dict(boxstyle="circle", fc="white"))
ax.text(0.5, 0.5, "Op", bbox=dict(boxstyle="square", fc="lightgrey"))
ax.text(0.9, 0.5, "Loss", bbox=dict(boxstyle="circle", fc="salmon"))
ax.annotate("", xy=(0.35, 0.5), xytext=(0.2, 0.5), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.75, 0.5), xytext=(0.6, 0.5), arrowprops=dict(arrowstyle="->"))
ax.annotate("Grad", xy=(0.1, 0.3), xytext=(0.9, 0.3), arrowprops=dict(arrowstyle="->", color='red', connectionstyle="arc3,rad=0.2"))
def plot_target_network(ax):
"""Function Approximation: Target Network concept."""
ax.axis('off')
ax.set_title("Target Network Updates", fontsize=12, fontweight='bold')
ax.text(0.3, 0.8, r"$Q_\theta$ (Active)", bbox=dict(fc="lightgreen"))
ax.text(0.7, 0.8, r"$Q_{\theta^-}$ (Target)", bbox=dict(fc="lightblue"))
ax.annotate("periodic copy", xy=(0.6, 0.8), xytext=(0.4, 0.8), arrowprops=dict(arrowstyle="<-", ls='--'))
def plot_ppo_clip(ax):
"""Policy Gradients: PPO Clipped Surrogate Objective."""
epsilon = 0.2
r = np.linspace(0.5, 1.5, 100)
advantage = 1.0
surr1 = r * advantage
surr2 = np.clip(r, 1-epsilon, 1+epsilon) * advantage
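    # PPO-Clip surrogate: L = E[min(r*A, clip(r, 1-eps, 1+eps)*A)],
    # where r = pi_theta(a|s) / pi_theta_old(a|s) is the probability ratio.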
ax.plot(r, surr1, '--', label="r*A")
ax.plot(r, np.minimum(surr1, surr2), 'r', label="min(r*A, clip*A)")
ax.set_title("PPO-Clip Objective", fontsize=12, fontweight='bold')
ax.legend(fontsize=8)
ax.axvline(1, color='gray', linestyle=':')
def plot_trpo_trust_region(ax):
"""Policy Gradients: TRPO Trust Region / KL Constraint."""
ax.set_title("TRPO Trust Region", fontsize=12, fontweight='bold')
circle = plt.Circle((0.5, 0.5), 0.3, color='blue', fill=False, label="KL Constraint")
ax.add_artist(circle)
ax.scatter(0.5, 0.5, c='black', label=r"$\pi_{old}$")
ax.arrow(0.5, 0.5, 0.15, 0.1, head_width=0.03, color='red', label="Update")
ax.set_xlim(0, 1); ax.set_ylim(0, 1)
ax.axis('off')
def plot_a3c_multi_worker(ax):
"""Actor-Critic: Asynchronous Multi-worker (A3C)."""
ax.axis('off')
ax.set_title("A3C Multi-worker", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Global Parameters", bbox=dict(fc="gold"), ha='center')
for i in range(3):
ax.text(0.2 + i*0.3, 0.2, f"Worker {i+1}", bbox=dict(fc="lightgrey"), ha='center', fontsize=8)
ax.annotate("", xy=(0.5, 0.7), xytext=(0.2 + i*0.3, 0.3), arrowprops=dict(arrowstyle="<->"))
def plot_sac_arch(ax):
"""Actor-Critic: SAC (Entropy-regularized)."""
ax.axis('off')
ax.set_title("SAC Architecture", fontsize=12, fontweight='bold')
ax.text(0.5, 0.7, "Actor", bbox=dict(fc="lightgreen"), ha='center')
ax.text(0.5, 0.3, "Entropy Bonus", bbox=dict(fc="salmon"), ha='center')
ax.text(0.1, 0.5, "State", ha='center')
ax.text(0.9, 0.5, "Action", ha='center')
ax.annotate("", xy=(0.4, 0.7), xytext=(0.15, 0.5), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.5, 0.55), xytext=(0.5, 0.4), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.85, 0.5), xytext=(0.6, 0.7), arrowprops=dict(arrowstyle="->"))
def plot_softmax_exploration(ax):
"""Exploration: Softmax / Boltzmann probabilities."""
x = np.arange(4)
logits = [1, 2, 5, 3]
for tau in [0.5, 1.0, 5.0]:
probs = np.exp(np.array(logits)/tau)
probs /= probs.sum()
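    # Boltzmann policy P(a) = exp(q_a/tau) / sum_b exp(q_b/tau):
    # tau -> 0 approaches greedy, tau -> inf approaches uniform.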
ax.plot(x, probs, marker='o', label=rf"$\tau={tau}$")
ax.set_title("Softmax Exploration", fontsize=12, fontweight='bold')
ax.legend(fontsize=8)
ax.set_xticks(x)
def plot_ucb_confidence(ax):
"""Exploration: Upper Confidence Bound (UCB)."""
actions = ['A1', 'A2', 'A3']
means = [0.6, 0.8, 0.5]
conf = [0.3, 0.1, 0.4]
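    # Error bars stand in for the UCB1 bonus c*sqrt(ln(t)/N(a)); the agent
    # picks argmax_a [Q(a) + bonus], favouring rarely tried actions.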
ax.bar(actions, means, yerr=conf, capsize=10, color='skyblue', label='Mean Q')
ax.set_title("UCB Action Values", fontsize=12, fontweight='bold')
ax.set_ylim(0, 1.2)
def plot_intrinsic_motivation(ax):
"""Exploration: Intrinsic Motivation / Curiosity."""
ax.axis('off')
ax.set_title("Intrinsic Motivation", fontsize=12, fontweight='bold')
ax.text(0.3, 0.5, "World Model", bbox=dict(fc="lightyellow"), ha='center')
ax.text(0.7, 0.5, "Prediction\nError", bbox=dict(boxstyle="circle", fc="orange"), ha='center')
ax.annotate("", xy=(0.58, 0.5), xytext=(0.42, 0.5), arrowprops=dict(arrowstyle="->"))
ax.text(0.85, 0.5, r"$R_{int}$", fontweight='bold')
def plot_entropy_bonus(ax):
"""Exploration: Entropy Regularization curve."""
p = np.linspace(0.01, 0.99, 50)
entropy = -(p * np.log(p) + (1-p) * np.log(1-p))
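    # Bernoulli entropy H(p) = -p*log(p) - (1-p)*log(1-p); it peaks at p = 0.5,
    # so the bonus pushes the policy away from deterministic action choices.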
ax.plot(p, entropy, color='purple')
ax.set_title(r"Entropy $H(\pi)$", fontsize=12, fontweight='bold')
ax.set_xlabel("$P(a)$")
def plot_options_framework(ax):
"""Hierarchical RL: Options Framework."""
ax.axis('off')
ax.set_title("Options Framework", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, r"High-level policy" + "\n" + r"$\pi_{hi}$", bbox=dict(fc="lightblue"), ha='center')
ax.text(0.2, 0.2, "Option 1", bbox=dict(fc="ivory"), ha='center')
ax.text(0.8, 0.2, "Option 2", bbox=dict(fc="ivory"), ha='center')
ax.annotate("", xy=(0.3, 0.3), xytext=(0.45, 0.7), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.7, 0.3), xytext=(0.55, 0.7), arrowprops=dict(arrowstyle="->"))
def plot_feudal_networks(ax):
"""Hierarchical RL: Feudal Networks / Hierarchy."""
ax.axis('off')
ax.set_title("Feudal Networks", fontsize=12, fontweight='bold')
ax.text(0.5, 0.85, "Manager", bbox=dict(fc="plum"), ha='center')
ax.text(0.5, 0.15, "Worker", bbox=dict(fc="wheat"), ha='center')
ax.annotate("Goal $g_t$", xy=(0.5, 0.3), xytext=(0.5, 0.75), arrowprops=dict(arrowstyle="->", lw=2))
def plot_world_model(ax):
"""Model-Based RL: Learned Dynamics Model."""
ax.axis('off')
ax.set_title("World Model (Dynamics)", fontsize=12, fontweight='bold')
ax.text(0.1, 0.5, "(s,a)", ha='center')
ax.text(0.5, 0.5, r"$\hat{P}$", bbox=dict(boxstyle="circle", fc="lightgrey"), ha='center')
ax.text(0.9, 0.7, r"$\hat{s}'$", ha='center')
ax.text(0.9, 0.3, r"$\hat{r}$", ha='center')
ax.annotate("", xy=(0.4, 0.5), xytext=(0.2, 0.5), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.8, 0.65), xytext=(0.6, 0.55), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.8, 0.35), xytext=(0.6, 0.45), arrowprops=dict(arrowstyle="->"))
def plot_model_planning(ax):
"""Model-Based RL: Planning / Rollouts in imagination."""
ax.axis('off')
ax.set_title("Model-Based Planning", fontsize=12, fontweight='bold')
ax.text(0.1, 0.5, "Real s", ha='center', fontweight='bold')
for i in range(3):
ax.annotate("", xy=(0.3+i*0.2, 0.5+(i%2)*0.1), xytext=(0.1+i*0.2, 0.5), arrowprops=dict(arrowstyle="->", color='gray'))
ax.text(0.3+i*0.2, 0.55+(i%2)*0.1, "imagined", fontsize=7)
def plot_offline_rl(ax):
"""Offline RL: Fixed dataset of trajectories."""
ax.axis('off')
ax.set_title("Offline RL Dataset", fontsize=12, fontweight='bold')
ax.text(0.5, 0.5, r"Static" + "\n" + r"Dataset" + "\n" + r"$\mathcal{D}$", bbox=dict(boxstyle="round", fc="lightgrey"), ha='center')
ax.annotate("No interaction", xy=(0.5, 0.9), xytext=(0.5, 0.75), arrowprops=dict(arrowstyle="->", color='red'))
ax.scatter([0.2, 0.8, 0.3, 0.7], [0.8, 0.8, 0.2, 0.2], marker='x', color='blue')
def plot_cql_regularization(ax):
"""Offline RL: CQL regularization visualization."""
q = np.linspace(-5, 5, 100)
penalty = q**2 * 0.1
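    # Stand-in penalty curve: CQL adds a regulariser that pushes down Q-values
    # on out-of-distribution actions while supporting those in the dataset.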
ax.plot(q, penalty, 'r', label='CQL Penalty')
ax.set_title("CQL Regularization", fontsize=12, fontweight='bold')
ax.set_xlabel("Q-value")
ax.legend(fontsize=8)
def plot_multi_agent_interaction(ax):
"""Multi-Agent RL: Agents communicating or competing."""
G = nx.complete_graph(3)
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(ax=ax, G=G, pos=pos, node_size=500, node_color=['red', 'blue', 'green'])
nx.draw_networkx_edges(ax=ax, G=G, pos=pos, style='dashed')
ax.set_title("Multi-Agent Interaction", fontsize=12, fontweight='bold')
ax.axis('off')
def plot_ctde(ax):
"""Multi-Agent RL: Centralized Training Decentralized Execution (CTDE)."""
ax.axis('off')
ax.set_title("CTDE Architecture", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Centralized Critic", bbox=dict(fc="gold"), ha='center')
ax.text(0.2, 0.2, "Agent 1", bbox=dict(fc="lightblue"), ha='center')
ax.text(0.8, 0.2, "Agent 2", bbox=dict(fc="lightblue"), ha='center')
ax.annotate("", xy=(0.5, 0.7), xytext=(0.25, 0.35), arrowprops=dict(arrowstyle="<-", color='gray'))
ax.annotate("", xy=(0.5, 0.7), xytext=(0.75, 0.35), arrowprops=dict(arrowstyle="<-", color='gray'))
def plot_payoff_matrix(ax):
"""Multi-Agent RL: Cooperative / Competitive Payoff Matrix."""
    # Nested list keeps the (row, col) payoff tuples intact; np.array would
    # flatten them into a 3-D int array and break the text formatting below.
    matrix = [[(3, 3), (0, 5)], [(5, 0), (1, 1)]]
ax.axis('off')
ax.set_title("Payoff Matrix (Prisoner's)", fontsize=12, fontweight='bold')
for i in range(2):
for j in range(2):
            ax.text(j, 1-i, str(matrix[i][j]), ha='center', va='center', bbox=dict(fc="white"))
ax.set_xlim(-0.5, 1.5); ax.set_ylim(-0.5, 1.5)
def plot_irl_reward_inference(ax):
"""Inverse RL: Infer reward from expert demonstrations."""
ax.axis('off')
ax.set_title("Inferred Reward Heatmap", fontsize=12, fontweight='bold')
grid = np.zeros((5, 5))
grid[2:4, 2:4] = 1.0 # Expert path
ax.imshow(grid, cmap='hot')
def plot_gail_flow(ax):
"""Inverse RL: GAIL (Generative Adversarial Imitation Learning)."""
ax.axis('off')
ax.set_title("GAIL Architecture", fontsize=12, fontweight='bold')
ax.text(0.2, 0.8, "Expert Data", bbox=dict(fc="lightgrey"), ha='center')
ax.text(0.2, 0.2, "Policy (Gen)", bbox=dict(fc="lightgreen"), ha='center')
ax.text(0.8, 0.5, "Discriminator", bbox=dict(boxstyle="square", fc="salmon"), ha='center')
ax.annotate("", xy=(0.6, 0.55), xytext=(0.35, 0.75), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.6, 0.45), xytext=(0.35, 0.25), arrowprops=dict(arrowstyle="->"))
def plot_meta_rl_nested_loop(ax):
"""Meta-RL: Outer loop (meta) + inner loop (adaptation)."""
ax.axis('off')
ax.set_title("Meta-RL Loops", fontsize=12, fontweight='bold')
ax.add_patch(plt.Circle((0.5, 0.5), 0.4, fill=False, ls='--'))
ax.add_patch(plt.Circle((0.5, 0.5), 0.2, fill=False))
ax.text(0.5, 0.5, "Inner\nLoop", ha='center', fontsize=8)
ax.text(0.5, 0.8, "Outer Loop", ha='center', fontsize=10)
def plot_task_distribution(ax):
"""Meta-RL: Multiple MDPs from distribution."""
ax.axis('off')
ax.set_title("Task Distribution", fontsize=12, fontweight='bold')
for i in range(3):
ax.text(0.2 + i*0.3, 0.5, f"Task {i+1}", bbox=dict(boxstyle="round", fc="ivory"), fontsize=8)
ax.annotate("sample", xy=(0.5, 0.8), xytext=(0.5, 0.6), arrowprops=dict(arrowstyle="<-"))
def plot_replay_buffer(ax):
"""Advanced: Experience Replay Buffer (FIFO)."""
ax.axis('off')
ax.set_title("Experience Replay Buffer", fontsize=12, fontweight='bold')
for i in range(5):
ax.add_patch(plt.Rectangle((0.1+i*0.15, 0.4), 0.1, 0.2, fill=True, color='lightgrey'))
ax.text(0.15+i*0.15, 0.5, f"e_{i}", ha='center')
ax.annotate("In", xy=(0.05, 0.5), xytext=(-0.1, 0.5), arrowprops=dict(arrowstyle="->"), annotation_clip=False)
ax.annotate("Out (Batch)", xy=(0.85, 0.5), xytext=(1.0, 0.5), arrowprops=dict(arrowstyle="<-"), annotation_clip=False)
def plot_state_visitation(ax):
"""Advanced: State Visitation / Occupancy Measure."""
data = np.random.multivariate_normal([0, 0], [[1, 0.5], [0.5, 1]], 1000)
ax.hexbin(data[:, 0], data[:, 1], gridsize=15, cmap='Blues')
ax.set_title("State Visitation Heatmap", fontsize=12, fontweight='bold')
def plot_regret_curve(ax):
"""Advanced: Regret / Cumulative Regret."""
t = np.arange(100)
regret = np.sqrt(t) + np.random.normal(0, 0.5, 100)
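    # sqrt(T) growth is the classic sub-linear regret rate of UCB-style
    # bandit algorithms; the noise is added purely for visual texture.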
ax.plot(t, regret, color='red', label='Sub-linear Regret')
ax.set_title("Cumulative Regret", fontsize=12, fontweight='bold')
ax.set_xlabel("Time")
ax.legend(fontsize=8)
def plot_attention_weights(ax):
"""Advanced: Attention Mechanisms (Heatmap)."""
weights = np.random.rand(5, 5)
ax.imshow(weights, cmap='viridis')
ax.set_title("Attention Weight Matrix", fontsize=12, fontweight='bold')
ax.set_xticks([]); ax.set_yticks([])
def plot_diffusion_policy(ax):
"""Advanced: Diffusion Policy denoising steps."""
ax.axis('off')
ax.set_title("Diffusion Policy (Denoising)", fontsize=12, fontweight='bold')
for i in range(4):
ax.scatter(0.1+i*0.25, 0.5, s=100/(i+1), c='black', alpha=1.0 - i*0.2)
if i < 3: ax.annotate("", xy=(0.25+i*0.25, 0.5), xytext=(0.15+i*0.25, 0.5), arrowprops=dict(arrowstyle="->"))
ax.text(0.5, 0.3, "Noise $\\rightarrow$ Action", ha='center', fontsize=8)
def plot_gnn_rl(ax):
"""Advanced: Graph Neural Networks for RL."""
G = nx.star_graph(4)
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(ax=ax, G=G, pos=pos, node_size=200, node_color='orange')
nx.draw_networkx_edges(ax=ax, G=G, pos=pos)
ax.set_title("GNN Message Passing", fontsize=12, fontweight='bold')
ax.axis('off')
def plot_latent_space(ax):
"""Advanced: World Model / Latent Space."""
ax.axis('off')
ax.set_title("Latent Space (VAE/Dreamer)", fontsize=12, fontweight='bold')
ax.text(0.1, 0.5, "Image", bbox=dict(fc="lightgrey"), ha='center')
ax.text(0.5, 0.5, "Latent $z$", bbox=dict(boxstyle="circle", fc="lightpink"), ha='center')
ax.text(0.9, 0.5, "Reconstruction", bbox=dict(fc="lightgrey"), ha='center')
ax.annotate("", xy=(0.4, 0.5), xytext=(0.2, 0.5), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.8, 0.5), xytext=(0.6, 0.5), arrowprops=dict(arrowstyle="->"))
def plot_convergence_log(ax):
"""Advanced: Convergence Analysis Plots (Log-scale)."""
iterations = np.arange(1, 100)
error = 10 / iterations**2
ax.loglog(iterations, error, color='green')
ax.set_title("Value Convergence (Log)", fontsize=12, fontweight='bold')
ax.set_xlabel("Iterations")
ax.set_ylabel("Error")
ax.grid(True, which="both", ls="-", alpha=0.3)
def plot_expected_sarsa_backup(ax):
"""Temporal Difference: Expected SARSA (Expectation over policy)."""
ax.axis('off')
ax.set_title("Expected SARSA Backup", fontsize=12, fontweight='bold')
ax.text(0.5, 0.9, "(s,a)", ha='center')
ax.text(0.5, 0.1, r"$\sum_{a'} \pi(a'|s') Q(s',a')$", ha='center', bbox=dict(boxstyle="round", fc="ivory"))
ax.annotate("", xy=(0.5, 0.25), xytext=(0.5, 0.8), arrowprops=dict(arrowstyle="<-", lw=2, color='purple'))
def plot_reinforce_flow(ax):
"""Policy Gradients: REINFORCE (Full trajectory flow)."""
ax.axis('off')
ax.set_title("REINFORCE Flow", fontsize=12, fontweight='bold')
    steps = ["s0", "a0", "r1", "s1", "...", r"$G_T$"]
for i, s in enumerate(steps):
ax.text(0.1 + i*0.15, 0.5, s, bbox=dict(boxstyle="circle", fc="white"))
ax.annotate(r"$\nabla_\theta J \propto G_t \nabla \ln \pi$", xy=(0.5, 0.8), ha='center', fontsize=12, color='darkgreen')
def plot_advantage_scaled_grad(ax):
"""Policy Gradients: Baseline / Advantage scaled gradient."""
ax.axis('off')
ax.set_title("Baseline Subtraction", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, r"$(G_t - b(s))$", bbox=dict(fc="salmon"), ha='center')
ax.text(0.5, 0.3, r"Scale $\nabla \ln \pi$", ha='center')
ax.annotate("", xy=(0.5, 0.4), xytext=(0.5, 0.7), arrowprops=dict(arrowstyle="->"))
def plot_skill_discovery(ax):
"""Hierarchical RL: Skill Discovery (Unsupervised clusters)."""
np.random.seed(0)
for i in range(3):
center = np.random.randn(2) * 2
pts = np.random.randn(20, 2) * 0.5 + center
ax.scatter(pts[:, 0], pts[:, 1], alpha=0.6, label=f"Skill {i+1}")
ax.set_title("Skill Embedding Space", fontsize=12, fontweight='bold')
ax.legend(fontsize=8)
def plot_imagination_rollout(ax):
"""Model-Based RL: Imagination-Augmented Rollouts (I2A)."""
ax.axis('off')
ax.set_title("Imagination Rollout (I2A)", fontsize=12, fontweight='bold')
ax.text(0.1, 0.5, "Input s", ha='center')
ax.add_patch(plt.Rectangle((0.3, 0.3), 0.4, 0.4, fill=True, color='lavender'))
ax.text(0.5, 0.5, "Imagination\nModule", ha='center')
ax.annotate("Imagined Paths", xy=(0.8, 0.5), xytext=(0.5, 0.5), arrowprops=dict(arrowstyle="->", color='gray', connectionstyle="arc3,rad=0.3"))
def plot_policy_gradient_flow(ax):
"""Policy Gradients: Gradient flow from reward to log-prob (DAG)."""
ax.axis('off')
ax.set_title("Policy Gradient Flow (DAG)", fontsize=12, fontweight='bold')
bbox_props = dict(boxstyle="round,pad=0.5", fc="lightgrey", ec="black", lw=1.5)
ax.text(0.1, 0.8, r"Trajectory $\tau$", ha="center", va="center", bbox=bbox_props)
ax.text(0.5, 0.8, r"Reward $R(\tau)$", ha="center", va="center", bbox=bbox_props)
ax.text(0.1, 0.2, r"Log-Prob $\log \pi_\theta$", ha="center", va="center", bbox=bbox_props)
ax.text(0.7, 0.5, r"$\nabla_\theta J(\theta)$", ha="center", va="center", bbox=dict(boxstyle="circle,pad=0.3", fc="gold", ec="black"))
# Draw arrows
ax.annotate("", xy=(0.35, 0.8), xytext=(0.2, 0.8), arrowprops=dict(arrowstyle="->", lw=2))
ax.annotate("", xy=(0.7, 0.65), xytext=(0.5, 0.75), arrowprops=dict(arrowstyle="->", lw=2))
ax.annotate("", xy=(0.6, 0.4), xytext=(0.25, 0.2), arrowprops=dict(arrowstyle="->", lw=2))
def plot_rl_as_inference_pgm(ax):
"""PGM: RL as Inference (Control as Inference)."""
ax.axis('off')
ax.set_title("RL as Inference (PGM)", fontsize=12, fontweight='bold')
nodes = {
's_t': (0.1, 0.8), 'a_t': (0.1, 0.4), 's_tp1': (0.5, 0.8),
'r_t': (0.5, 0.4), 'O_t': (0.8, 0.4)
}
for name, pos in nodes.items():
color = 'white' if 'O' not in name else 'lightcoral'
ax.text(pos[0], pos[1], name, bbox=dict(boxstyle="circle", fc=color), ha='center')
# Dependencies
arrows = [('s_t', 's_tp1'), ('a_t', 's_tp1'), ('s_t', 'a_t'), ('a_t', 'r_t'), ('r_t', 'O_t')]
for start, end in arrows:
ax.annotate("", xy=nodes[end], xytext=nodes[start], arrowprops=dict(arrowstyle="->"))
def plot_rl_taxonomy_tree(ax):
"""Taxonomy: RL Algorithm Classification Tree."""
ax.axis('off')
ax.set_title("RL Algorithm Taxonomy", fontsize=12, fontweight='bold')
ax.text(0.5, 0.9, "Reinforcement Learning", bbox=dict(fc="lightgrey"), ha='center')
ax.text(0.25, 0.6, "Model-Free", bbox=dict(fc="ivory"), ha='center')
ax.text(0.75, 0.6, "Model-Based", bbox=dict(fc="ivory"), ha='center')
ax.text(0.1, 0.3, "Policy Opt", fontsize=8, ha='center')
ax.text(0.4, 0.3, "Value-Based", fontsize=8, ha='center')
for x in [0.25, 0.75]: ax.annotate("", xy=(x, 0.65), xytext=(0.5, 0.85), arrowprops=dict(arrowstyle="->"))
for x in [0.1, 0.4]: ax.annotate("", xy=(x, 0.35), xytext=(0.25, 0.55), arrowprops=dict(arrowstyle="->"))
def plot_distributional_rl_atoms(ax):
"""Distributional RL: C51 return probability atoms."""
returns = np.linspace(-10, 10, 51)
probs = np.exp(-(returns - 2)**2 / 4) + np.exp(-(returns + 4)**2 / 2)
probs /= probs.sum()
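    # C51-style parameterisation: Z(s,a) is a categorical distribution over 51
    # fixed atoms; the bimodal shape here is synthetic, for illustration only.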
ax.bar(returns, probs, width=0.3, color='steelblue', alpha=0.8)
ax.set_title("Distributional RL (Atoms)", fontsize=12, fontweight='bold')
ax.set_xlabel("Return $Z$")
ax.set_ylabel("Probability")
def plot_her_goal_relabeling(ax):
"""HER: Hindsight Experience Replay goal relabeling."""
ax.axis('off')
ax.set_title("HER Goal Relabeling", fontsize=12, fontweight='bold')
path = np.array([[0.1, 0.2], [0.3, 0.4], [0.6, 0.5], [0.8, 0.7]])
ax.plot(path[:, 0], path[:, 1], 'k--', alpha=0.3)
ax.scatter(path[:, 0], path[:, 1], c='black', s=20)
ax.text(0.9, 0.9, "True Goal G", color='red', fontweight='bold', ha='center')
ax.text(0.8, 0.6, "Relabeled G'", color='blue', fontweight='bold', ha='center')
ax.annotate("", xy=(0.8, 0.7), xytext=(0.8, 0.63), arrowprops=dict(arrowstyle="->", color='blue'))
def plot_dyna_q_flow(ax):
"""Dyna-Q: Real interaction + Model-based planning flow."""
ax.axis('off')
ax.set_title("Dyna-Q Architecture", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Agent Policy", bbox=dict(fc="white"), ha='center')
ax.text(0.2, 0.5, "Real World", bbox=dict(fc="lightgreen"), ha='center')
ax.text(0.8, 0.5, "Model", bbox=dict(fc="lightblue"), ha='center')
ax.text(0.5, 0.2, "Value Function / Q", bbox=dict(fc="gold"), ha='center')
# Loop
ax.annotate("Direct RL", xy=(0.35, 0.25), xytext=(0.2, 0.45), arrowprops=dict(arrowstyle="->"))
ax.annotate("Planning", xy=(0.65, 0.25), xytext=(0.8, 0.45), arrowprops=dict(arrowstyle="->"))
def plot_noisy_nets_parameters(ax):
"""Noisy Nets: Parameter noise distribution σ for weights."""
x = np.linspace(-3, 3, 100)
    y = np.exp(-x**2 / 2)  # Gaussian bump standing in for the learned mean weight mu
ax.plot(x, y, color='black', label=r"$\mu$ (Mean)")
ax.fill_between(x, y-0.2, y+0.2, color='gray', alpha=0.3, label=r"$\sigma \cdot \epsilon$ (Noise)")
ax.set_title("Noisy Nets Parameter Noise", fontsize=12, fontweight='bold')
ax.legend(fontsize=8)
def plot_icm_curiosity(ax):
"""Exploration: Intrinsic Curiosity Module (ICM)."""
ax.axis('off')
ax.set_title("ICM: Inverse & Forward Models", fontsize=12, fontweight='bold')
ax.text(0.1, 0.5, "s_t, s_t+1", ha='center')
ax.text(0.5, 0.8, "Inverse Model", bbox=dict(fc="ivory"), ha='center')
ax.text(0.5, 0.2, "Forward Model", bbox=dict(fc="ivory"), ha='center')
ax.text(0.9, 0.5, "Intrinsic Reward", ha='center', color='red')
ax.annotate("", xy=(0.35, 0.75), xytext=(0.2, 0.55), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.35, 0.25), xytext=(0.2, 0.45), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.8, 0.5), xytext=(0.65, 0.3), arrowprops=dict(arrowstyle="->"))
def plot_v_trace_impala(ax):
"""IMPALA: V-trace asynchronous importance sampling."""
ax.axis('off')
ax.set_title("V-trace (IMPALA)", fontsize=12, fontweight='bold')
for i in range(4):
h = 0.5 + 0.3*np.sin(i)
ax.bar(0.2+i*0.2, h, width=0.1, color='teal')
ax.text(0.2+i*0.2, h+0.05, rf"$\rho_{i}$", ha='center', fontsize=8)
ax.axhline(0.5, ls='--', color='red', label="Clipped $\\rho$")
ax.set_ylim(0, 1.2)
def plot_qmix_mixing_net(ax):
"""Multi-Agent RL: QMIX Mixing Network."""
ax.axis('off')
ax.set_title("QMIX Architecture", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Mixing Network", bbox=dict(boxstyle="round,pad=1", fc="gold"), ha='center')
for i in range(3):
ax.text(0.2+i*0.3, 0.4, f"Agent {i+1} Q", bbox=dict(fc="grey"), ha='center', fontsize=7)
ax.annotate("", xy=(0.5, 0.65), xytext=(0.2+i*0.3, 0.45), arrowprops=dict(arrowstyle="->"))
ax.text(0.5, 0.1, "Global State s", ha='center')
ax.annotate("hypernets", xy=(0.5, 0.68), xytext=(0.5, 0.2), arrowprops=dict(arrowstyle="->", ls=':'))
def plot_saliency_heatmaps(ax):
"""Interpretability: Attention/Saliency Heatmap on input."""
# Dummy "state" (e.g. Breakout screen)
img = np.zeros((20, 20))
img[15, 8:12] = 1.0 # Paddle
img[5:7, 5:15] = 0.5 # Bricks
heatmap = np.random.rand(20, 20) * 0.5
heatmap[14:17, 7:13] = 1.0 # High attention on paddle
ax.imshow(img, cmap='gray')
ax.imshow(heatmap, cmap='hot', alpha=0.5)
ax.set_title("Action Saliency Heatmap", fontsize=12, fontweight='bold')
ax.axis('off')
def plot_action_selection_noise(ax):
"""Exploration: OU-noise vs Gaussian Noise paths."""
t = np.arange(100)
gaussian = np.random.normal(0, 0.1, 100)
ou = np.zeros(100)
for i in range(1, 100):
ou[i] = ou[i-1] * 0.9 + np.random.normal(0, 0.1)
ax.plot(t, gaussian, label="Gaussian", alpha=0.5)
ax.plot(t, ou, label="Ornstein-Uhlenbeck", color='red')
ax.set_title("Action Selection Noise", fontsize=12, fontweight='bold')
ax.legend(fontsize=8)
def plot_tsne_state_embeddings(ax):
"""Interpretability: t-SNE / UMAP State Clusters."""
np.random.seed(42)
for i in range(3):
center = np.random.randn(2) * 5
pts = np.random.randn(30, 2) + center
ax.scatter(pts[:, 0], pts[:, 1], alpha=0.6, label=f"Cluster {i+1}")
ax.set_title("t-SNE State Embeddings", fontsize=12, fontweight='bold')
ax.legend(fontsize=8)
def plot_loss_landscape(fig, gs):
"""Optimization: Loss Landscape / Surface."""
ax = fig.add_subplot(gs[0, 0], projection='3d')
x = np.linspace(-2, 2, 30)
y = np.linspace(-2, 2, 30)
X, Y = np.meshgrid(x, y)
Z = X**2 + Y**2 + 0.5*np.sin(5*X) # Non-convex surface
ax.plot_surface(X, Y, Z, cmap='terrain', alpha=0.8)
ax.set_title("Policy Loss Landscape", fontsize=12, fontweight='bold')
def plot_success_rate_curve(ax):
"""Evaluation: Success Rate over training."""
steps = np.linspace(0, 1e6, 100)
success = 1.0 / (1.0 + np.exp(-1e-5 * (steps - 4e5))) # S-curve
ax.plot(steps, success, color='darkgreen', lw=2)
ax.set_title("Success Rate vs Steps", fontsize=12, fontweight='bold')
ax.set_ylim(-0.05, 1.05)
ax.grid(True, alpha=0.3)
def plot_hyperparameter_sensitivity(ax):
"""Analysis: Hyperparameter Sensitivity Heatmap."""
lr = [1e-5, 1e-4, 1e-3]
batches = [32, 64, 128]
data = np.array([[60, 85, 40], [75, 95, 80], [30, 50, 45]])
im = ax.imshow(data, cmap='RdYlGn')
ax.set_xticks(range(3)); ax.set_xticklabels(batches)
ax.set_yticks(range(3)); ax.set_yticklabels(lr)
ax.set_xlabel("Batch Size"); ax.set_ylabel("Learning Rate")
ax.set_title("Hyperparam Sensitivity", fontsize=12, fontweight='bold')
for (i, j), z in np.ndenumerate(data):
ax.text(j, i, f'{z}%', ha='center', va='center')
def plot_action_persistence(ax):
"""Dynamics: Action Persistence (Frame Skipping)."""
ax.axis('off')
ax.set_title("Action Persistence (k=4)", fontsize=12, fontweight='bold')
for i in range(2):
ax.add_patch(plt.Rectangle((0.1, 0.6-i*0.4), 0.8, 0.2, fill=False))
ax.text(0.5, 0.7-i*0.4, f"Action A_{i}", ha='center')
for j in range(4):
ax.add_patch(plt.Rectangle((0.1+j*0.2, 0.6-i*0.4), 0.2, 0.2, fill=True, alpha=0.2))
ax.text(0.5, 0.45, "Repeat Action for k frames", ha='center', color='blue', fontsize=8)
def plot_muzero_search_tree(ax):
"""Model-Based: MuZero Search Tree with dynamics."""
ax.axis('off')
ax.set_title("MuZero Search Tree", fontsize=12, fontweight='bold')
ax.text(0.5, 0.9, "Node $s$", bbox=dict(boxstyle="circle", fc="white"), ha='center')
ax.text(0.3, 0.5, "Dyn $g$", bbox=dict(fc="lavender"), ha='center')
ax.text(0.3, 0.1, "Pred $f$", bbox=dict(fc="ivory"), ha='center')
ax.annotate("", xy=(0.3, 0.6), xytext=(0.5, 0.85), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.3, 0.2), xytext=(0.3, 0.4), arrowprops=dict(arrowstyle="->"))
def plot_policy_distillation(ax):
"""Deep RL: Policy Distillation (Teacher-Student)."""
ax.axis('off')
ax.set_title("Policy Distillation", fontsize=12, fontweight='bold')
ax.text(0.2, 0.5, r"Teacher $\pi_T$", bbox=dict(fc="gold"), ha='center')
ax.text(0.8, 0.5, r"Student $\pi_S$", bbox=dict(fc="lightgrey"), ha='center')
ax.annotate("KL-Divergence Loss", xy=(0.7, 0.5), xytext=(0.3, 0.5), arrowprops=dict(arrowstyle="->", lw=2, color='red'))
def plot_decision_transformer_tokens(ax):
"""Transformers: Token Sequence (DT/TT)."""
ax.axis('off')
ax.set_title("Decision Transformer Tokens", fontsize=12, fontweight='bold')
tokens = [r"$\hat{R}_t$", "$s_t$", "$a_t$", r"$\hat{R}_{t+1}$", "$s_{t+1}$"]
for i, t in enumerate(tokens):
ax.text(0.1+i*0.2, 0.5, t, bbox=dict(boxstyle="round", fc="white"))
ax.annotate("causal attention", xy=(0.5, 0.7), xytext=(0.5, 0.6), annotation_clip=False)
def plot_performance_profiles_rliable(ax):
"""Evaluation: Success Probability Profiles (rliable)."""
x = np.linspace(0, 1, 100)
y1 = x**2
y2 = np.sqrt(x)
ax.plot(x, y1, label="Algo A")
ax.plot(x, y2, label="Algo B")
ax.set_title("Performance Profiles", fontsize=12, fontweight='bold')
ax.set_xlabel("Normalized Score")
ax.set_ylabel("Probability of higher score")
ax.legend(fontsize=8)
def plot_safety_shielding(ax):
"""Safety RL: Action Shielding / Constraints."""
ax.axis('off')
ax.set_title("Safety Shielding", fontsize=12, fontweight='bold')
ax.add_patch(plt.Circle((0.5, 0.5), 0.4, fill=True, color='red', alpha=0.1))
ax.text(0.5, 0.5, "Forbidden\nRegion", ha='center', color='red')
ax.annotate("Shielded Action", xy=(0.2, 0.2), xytext=(0.4, 0.4), arrowprops=dict(arrowstyle="->", color='green', lw=2))
def plot_automated_curriculum(ax):
"""Training: Automated Curriculum Difficulty."""
t = np.arange(100)
difficulty = 1.0 / (1.0 + np.exp(-0.05 * (t - 50)))
performance = 0.8 / (1.0 + np.exp(-0.05 * (t - 40)))
ax.plot(t, difficulty, label="Task Difficulty", color='black')
ax.plot(t, performance, '--', label="Agent Performance", color='blue')
ax.set_title("Automated Curriculum", fontsize=12, fontweight='bold')
ax.legend(fontsize=8)
def plot_domain_randomization(ax):
"""Sim-to-Real: Domain Randomization parameter distribution."""
params = np.random.normal(1.0, 0.3, 1000)
ax.hist(params, bins=30, color='orange', alpha=0.6)
ax.set_title("Domain Randomization ($P(\\mu)$)", fontsize=12, fontweight='bold')
ax.set_xlabel("Friction / Mass Parameter")
def plot_rlhf_flow(ax):
"""Alignment: RL with Human Feedback (RLHF)."""
ax.axis('off')
ax.set_title("RLHF Flow Diagram", fontsize=12, fontweight='bold')
ax.text(0.1, 0.8, "Human Pref", bbox=dict(fc="salmon"), ha='center')
ax.text(0.5, 0.8, "Reward Model", bbox=dict(fc="gold"), ha='center')
ax.text(0.9, 0.8, "Fine-tuned Policy", bbox=dict(fc="lightgreen"), ha='center')
ax.annotate("", xy=(0.4, 0.8), xytext=(0.2, 0.8), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.8, 0.8), xytext=(0.6, 0.8), arrowprops=dict(arrowstyle="->"))
ax.annotate("PPO Update", xy=(0.5, 0.5), xytext=(0.9, 0.7), arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=0.3"))
def plot_successor_representations(ax):
"""Neuro-inspired RL: Successor Representation (SR) Matrix M."""
M = np.zeros((10, 10))
for i in range(10):
for j in range(10):
M[i, j] = 0.9**abs(i-j) # Decaying future occupancy
ax.imshow(M, cmap='viridis')
ax.set_title("Successor Representation $M$", fontsize=12, fontweight='bold')
ax.set_xlabel("State $j$")
ax.set_ylabel("State $i$")
def plot_maxent_irl_trajectories(ax):
"""IRL: MaxEnt IRL (Log-probability of trajectories)."""
ax.axis('off')
ax.set_title("MaxEnt IRL Distribution", fontsize=12, fontweight='bold')
for i in range(5):
alpha = 0.1 + i*0.2
ax.plot([0, 1], [0.5, 0.5+0.1*i], color='blue', alpha=alpha)
ax.plot([0, 1], [0.5, 0.5-0.1*i], color='blue', alpha=alpha)
ax.text(0.5, 0.8, r"$P(\tau) \propto \exp(R(\tau))$", ha='center', fontsize=12)
def plot_information_bottleneck(ax):
"""Theory: Information Bottleneck in RL."""
ax.axis('off')
ax.set_title("Information Bottleneck", fontsize=12, fontweight='bold')
ax.text(0.1, 0.5, "S", bbox=dict(boxstyle="circle", fc="white"), ha='center')
ax.text(0.5, 0.5, "Z", bbox=dict(boxstyle="circle", fc="gold"), ha='center')
ax.text(0.9, 0.5, "A", bbox=dict(boxstyle="circle", fc="white"), ha='center')
ax.annotate("Compress", xy=(0.4, 0.5), xytext=(0.15, 0.5), arrowprops=dict(arrowstyle="->"))
ax.annotate("Extract", xy=(0.85, 0.5), xytext=(0.6, 0.5), arrowprops=dict(arrowstyle="->"))
ax.text(0.5, 0.2, r"$\min I(S;Z)$ s.t. $I(Z;A) \geq I_c$", ha='center', fontsize=8)
def plot_es_population_distribution(ax):
"""Evolutionary Strategies: ES Population Distribution."""
np.random.seed(0)
mu = [0, 0]
points = np.random.randn(50, 2) * 0.5 + mu
ax.scatter(points[:, 0], points[:, 1], color='blue', alpha=0.4, label="Population")
ax.scatter(mu[0], mu[1], color='red', marker='x', label=r"$\mu$")
ax.annotate("Gradient Estimate", xy=(1.0, 1.0), xytext=(0, 0), arrowprops=dict(arrowstyle="->", color='red'))
ax.set_title("ES Population Update", fontsize=12, fontweight='bold')
ax.legend(fontsize=8)
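# Hedged sketch of the vanilla ES gradient estimate the red arrow alludes to:
# grad J ~= (1 / (n * sigma)) * sum_i f(mu + sigma * eps_i) * eps_i, with f a
# hypothetical fitness function and mu the parameter mean vector (np.ndarray).
def es_gradient_estimate(f, mu, sigma=0.5, n=50, rng=None):
    rng = np.random.default_rng() if rng is None else rng
    eps = rng.standard_normal((n, mu.shape[0]))
    fitness = np.array([f(mu + sigma * e) for e in eps])
    return (eps * fitness[:, None]).mean(axis=0) / sigma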
def plot_cbf_safe_set(ax):
"""Safety RL: Control Barrier Function (CBF) Safe Set."""
ax.axis('off')
ax.set_title("CBF Safe Set Boundary", fontsize=12, fontweight='bold')
ax.add_patch(plt.Circle((0.5, 0.5), 0.35, fill=False, color='black', lw=2))
ax.text(0.5, 0.5, r"Safe Set $h(s) \geq 0$", ha='center')
ax.text(0.5, 0.1, "Unsafe $h(s) < 0$", ha='center', color='red')
ax.annotate("", xy=(0.8, 0.8), xytext=(0.6, 0.6), arrowprops=dict(arrowstyle="->", color='blue'))
ax.text(0.75, 0.65, r"$\nabla h$", color='blue')
def plot_count_based_exploration(ax):
"""Exploration: Count-based Heatmap N(s)."""
grid = np.random.poisson(2, (10, 10))
grid[0, 0] = 50; grid[9, 9] = 1
im = ax.imshow(grid, cmap='hot')
ax.set_title("Visit Counts $N(s)$", fontsize=12, fontweight='bold')
plt.colorbar(im, ax=ax, label="Visits")
def plot_thompson_sampling(ax):
"""Exploration: Thompson Sampling Posterior Distribution."""
    import scipy.stats as stats  # local import, used only for these Beta pdfs
    x = np.linspace(0, 1, 100)
    y1 = stats.beta.pdf(x, 2, 5)
y2 = stats.beta.pdf(x, 10, 4)
ax.plot(x, y1, label="Action 1 (Uncertain)")
ax.plot(x, y2, label="Action 2 (Certain)")
ax.fill_between(x, y1, alpha=0.2)
ax.fill_between(x, y2, alpha=0.2)
ax.set_title("Thompson Sampling Posteriors", fontsize=12, fontweight='bold')
ax.legend(fontsize=8)
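# Minimal Thompson sampling step under Beta posteriors like those plotted:
# draw one plausible value per arm, then act greedily on the draws.
# `posteriors` is a hypothetical list of (alpha, beta) pairs, one per arm.
def thompson_sample_action(posteriors, rng=None):
    rng = np.random.default_rng() if rng is None else rng
    return int(np.argmax([rng.beta(a, b) for a, b in posteriors]))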
def plot_adversarial_rl_interaction(ax):
"""Multi-Agent: Adversarial RL (Protaganist vs Antagonist)."""
ax.axis('off')
ax.set_title("Adversarial RL Interaction", fontsize=12, fontweight='bold')
ax.text(0.2, 0.5, "Protaganist", bbox=dict(fc="lightblue"), ha='center')
ax.text(0.8, 0.5, "Antagonist", bbox=dict(fc="salmon"), ha='center')
ax.annotate("Force Distortion", xy=(0.35, 0.5), xytext=(0.65, 0.5), arrowprops=dict(arrowstyle="->", color='red'))
ax.annotate("Policy Update", xy=(0.5, 0.8), xytext=(0.5, 0.6), arrowprops=dict(arrowstyle="<-", connectionstyle="arc3,rad=-0.3"))
def plot_hierarchical_subgoals(ax):
"""Hierarchical RL: Subgoal Trajectory Waypoints."""
ax.set_title("Subgoal Trajectory", fontsize=12, fontweight='bold')
ax.plot([0, 1], [0, 1], 'k--', alpha=0.3)
ax.scatter([0, 0.3, 0.7, 1], [0, 0.4, 0.6, 1], c=['black', 'red', 'red', 'gold'], s=100)
ax.text(0.3, 0.45, "Subgoal 1", color='red', fontsize=8)
ax.text(0.7, 0.65, "Subgoal 2", color='red', fontsize=8)
ax.text(1, 1.1, "Final Goal", color='gold', fontweight='bold', ha='center')
def plot_offline_distribution_shift(ax):
"""Offline RL: Distribution Shift (Shift between D and pi)."""
x = np.linspace(-5, 5, 200)
d = np.exp(-(x+1)**2 / 2)
pi = np.exp(-(x-2)**2 / 1.5)
ax.plot(x, d, label=r"Offline Dataset $\mathcal{D}$", color='grey')
ax.plot(x, pi, label=r"Learned Policy $\pi$", color='blue')
ax.fill_between(x, 0, d, color='grey', alpha=0.1)
ax.fill_between(x, 0, pi, color='blue', alpha=0.1)
ax.set_title("Action Distribution Shift", fontsize=12, fontweight='bold')
ax.legend(fontsize=8)
def plot_rnd_curiosity(ax):
"""Exploration: Random Network Distillation (RND)."""
ax.axis('off')
ax.set_title("RND: Predictor vs Target", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "State $s$", bbox=dict(boxstyle="circle", fc="white"), ha='center')
ax.text(0.3, 0.5, "Fixed Target Net", bbox=dict(fc="lightgrey"), ha='center', fontsize=8)
ax.text(0.7, 0.5, "Predictor Net", bbox=dict(fc="ivory"), ha='center', fontsize=8)
ax.text(0.5, 0.2, "MSE Error = Intrinsic Reward", ha='center', color='red', fontsize=9)
ax.annotate("", xy=(0.3, 0.6), xytext=(0.5, 0.75), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.7, 0.6), xytext=(0.5, 0.75), arrowprops=dict(arrowstyle="->"))
def plot_bcq_offline_constraint(ax):
"""Offline RL: Batch-Constrained Q-learning (BCQ)."""
ax.axis('off')
ax.set_title("BCQ: Action Constraint", fontsize=12, fontweight='bold')
ax.add_patch(plt.Circle((0.5, 0.5), 0.35, fill=True, color='blue', alpha=0.1))
ax.text(0.5, 0.5, "Dataset Action\nDistribution", ha='center', color='blue')
ax.annotate("Constrained Action", xy=(0.4, 0.45), xytext=(0.2, 0.2), arrowprops=dict(arrowstyle="->", lw=2))
ax.text(0.5, 0.1, r"$\max Q(s, a)$ s.t. $a \in \mathcal{D}$", ha='center', fontsize=9)
def plot_pbt_evolution(ax):
"""Training: Population-Based Training (PBT)."""
ax.axis('off')
ax.set_title("Population-Based Training", fontsize=12, fontweight='bold')
for i in range(3):
ax.plot([0.1, 0.9], [0.8-i*0.3, 0.8-i*0.3], 'grey', alpha=0.3)
ax.text(0.1, 0.8-i*0.3, f"Agent {i+1}", ha='right')
ax.scatter([0.2, 0.5, 0.8], [0.8-i*0.3, 0.8-i*0.3, 0.8-i*0.3], color='blue')
ax.annotate("Exploit & Perturb", xy=(0.5, 0.2), xytext=(0.5, 0.5), arrowprops=dict(arrowstyle="->", color='red'))
def plot_recurrent_state_flow(ax):
"""Deep RL: Recurrent State Flow (DRQN/R2D2)."""
ax.axis('off')
ax.set_title("Recurrent $h_t$ Flow", fontsize=12, fontweight='bold')
for i in range(3):
ax.text(0.2+i*0.3, 0.5, f"Cell {i}", bbox=dict(fc="ivory"), ha='center')
if i < 2:
ax.annotate("", xy=(0.35+i*0.3, 0.5), xytext=(0.25+i*0.3, 0.5), arrowprops=dict(arrowstyle="->", color='blue'))
ax.text(0.3+i*0.3, 0.55, rf"$h_{i}$", color='blue', fontsize=8)
def plot_belief_state_pomdp(ax):
"""Theory: Belief State in POMDPs."""
x = np.linspace(0, 1, 100)
y = np.exp(-(x-0.3)**2 / 0.02) + 0.3*np.exp(-(x-0.8)**2 / 0.01)
ax.plot(x, y, color='purple')
ax.fill_between(x, y, alpha=0.2, color='purple')
ax.set_title(r"Belief State $b(s)$", fontsize=12, fontweight='bold')
ax.set_xlabel("State Space")
ax.set_ylabel("Probability")
def plot_pareto_front_morl(ax):
"""Multi-Objective RL: Pareto Front."""
np.random.seed(42)
x = np.random.rand(50)
y = np.random.rand(50)
ax.scatter(x, y, alpha=0.3, color='grey')
    # Trace the actual non-dominated set (Pareto front), left to right.
    idx = np.argsort(x)[::-1]
    xs, ys = x[idx], y[idx]
    keep = ys > np.maximum.accumulate(np.concatenate(([-np.inf], ys[:-1])))
    ax.plot(xs[keep][::-1], ys[keep][::-1], 'r-o', label="Pareto Front")
ax.set_title("Multi-Objective Pareto Front", fontsize=12, fontweight='bold')
ax.set_xlabel("Reward A")
ax.set_ylabel("Reward B")
ax.legend(fontsize=8)
def plot_differential_value_average_reward(ax):
"""Theory: Differential Value (Average Reward RL)."""
t = np.arange(100)
v = np.sin(0.2*t) + 0.05*t # Increasing with oscillation
rho = 0.05 # average gain
ax.plot(t, v, label="Value $V(s_t)$")
ax.plot(t, rho*t, '--', label=r"Gain $\rho \cdot t$", color='red')
ax.set_title("Differential Value $v(s)$", fontsize=12, fontweight='bold')
ax.legend(fontsize=8)
def plot_distributed_rl_cluster(ax):
"""Infrastructure: Distributed RL Cluster (Ray/RLLib)."""
ax.axis('off')
ax.set_title("Distributed RL Cluster", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Learner / GPU", bbox=dict(boxstyle="round", fc="gold"), ha='center')
ax.text(0.5, 0.5, "Replay Buffer", bbox=dict(fc="lightgrey"), ha='center')
for i in range(3):
ax.text(0.2+i*0.3, 0.2, f"Worker {i+1}", bbox=dict(fc="ivory"), ha='center', fontsize=8)
ax.annotate("", xy=(0.5, 0.45), xytext=(0.2+i*0.3, 0.25), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.5, 0.75), xytext=(0.5, 0.55), arrowprops=dict(arrowstyle="->"))
def plot_neuroevolution_topology(ax):
"""Evolutionary RL: Topology Evolution (NEAT)."""
ax.axis('off')
ax.set_title("Neuroevolution Topology", fontsize=12, fontweight='bold')
nodes = [(0.2, 0.5), (0.5, 0.8), (0.5, 0.2), (0.8, 0.5)]
    for p in nodes: ax.text(p[0], p[1], " ", bbox=dict(boxstyle="circle", fc="white"))  # non-empty text so the circle bbox actually renders
# Edges
ax.annotate("", xy=nodes[1], xytext=nodes[0], arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=nodes[2], xytext=nodes[0], arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=nodes[3], xytext=nodes[1], arrowprops=dict(arrowstyle="->"))
# Mutation
ax.text(0.5, 0.5, "New Node", bbox=dict(boxstyle="circle", fc="yellow"), ha='center', fontsize=7)
ax.annotate("", xy=(0.5, 0.5), xytext=nodes[0], arrowprops=dict(arrowstyle="->", color='red', ls='--'))
def plot_ewc_elastic_weights(ax):
"""Continual RL: Elastic Weight Consolidation (EWC)."""
ax.axis('off')
ax.set_title("EWC Elastic Constraint", fontsize=12, fontweight='bold')
ax.add_patch(plt.Circle((0.3, 0.5), 0.2, color='blue', alpha=0.2, label="Task A"))
ax.add_patch(plt.Circle((0.7, 0.5), 0.2, color='red', alpha=0.2, label="Task B"))
ax.annotate("", xy=(0.5, 0.5), xytext=(0.3, 0.5), arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=0.3"))
ax.text(0.5, 0.7, "Spring Constraint", color='darkgreen', ha='center', fontsize=9)
def plot_successor_features(ax):
"""Theory: Successor Features (SF)."""
ax.axis('off')
ax.set_title(r"Successor Features $\psi$", fontsize=12, fontweight='bold')
ax.text(0.2, 0.5, r"Features $\phi(s)$", bbox=dict(fc="ivory"), ha='center')
ax.text(0.8, 0.5, r"SF $\psi(s)$", bbox=dict(fc="gold"), ha='center')
ax.annotate(r"$\sum \gamma^t \phi(s_t)$", xy=(0.7, 0.5), xytext=(0.3, 0.5), arrowprops=dict(arrowstyle="->", lw=2))
def plot_adversarial_state_noise(ax):
r"""Safety: Adversarial State Noise ($s + \delta$)."""
ax.axis('off')
ax.set_title("Adversarial Perturbation", fontsize=12, fontweight='bold')
ax.text(0.2, 0.5, "State $s$", bbox=dict(fc="lightgreen"), ha='center')
ax.text(0.5, 0.5, "+", fontsize=20, ha='center')
ax.text(0.8, 0.5, r"Noise $\delta$", bbox=dict(fc="salmon"), ha='center')
ax.annotate("Target: Wrong Action!", xy=(0.5, 0.2), xytext=(0.5, 0.4), arrowprops=dict(arrowstyle="->", color='red'))
def plot_behavioral_cloning_il(ax):
"""Imitation: Behavioral Cloning (BC)."""
ax.axis('off')
ax.set_title("Behavioral Cloning Flow", fontsize=12, fontweight='bold')
ax.text(0.1, 0.5, "Expert Data\n$(s^*, a^*)$", bbox=dict(fc="gold"), ha='center', fontsize=8)
ax.text(0.5, 0.5, "Supervised\nLearning", bbox=dict(fc="ivory"), ha='center', fontsize=8)
ax.text(0.9, 0.5, r"Clone Policy\n$\pi_{BC}$", bbox=dict(fc="lightgrey"), ha='center', fontsize=8)
ax.annotate("", xy=(0.35, 0.5), xytext=(0.2, 0.5), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.75, 0.5), xytext=(0.6, 0.5), arrowprops=dict(arrowstyle="->"))
def plot_relational_graph_state(ax):
"""Relational RL: Graph-based State Representation."""
ax.axis('off')
ax.set_title("Relational Graph State", fontsize=12, fontweight='bold')
pos = {1: (0.3, 0.7), 2: (0.7, 0.7), 3: (0.5, 0.3)}
for k, p in pos.items():
ax.text(p[0], p[1], f"Obj {k}", bbox=dict(boxstyle="round", fc="lightblue"), ha='center')
edges = [(1, 2), (2, 3), (3, 1)]
for u, v in edges:
ax.annotate("relation", xy=pos[v], xytext=pos[u], arrowprops=dict(arrowstyle="-", color='grey', ls=':'), ha='center')
def plot_quantum_rl_circuit(ax):
"""Quantum RL: Parameterized Quantum Circuit (PQC) Policy."""
ax.axis('off')
ax.set_title("Quantum Policy (PQC)", fontsize=12, fontweight='bold')
ax.plot([0.1, 0.9], [0.7, 0.7], 'k', lw=1)
ax.plot([0.1, 0.9], [0.3, 0.3], 'k', lw=1)
ax.text(0.2, 0.7, r"$|0\rangle$", ha='right')
ax.text(0.2, 0.3, r"$|0\rangle$", ha='right')
# Gates
ax.text(0.4, 0.7, r"$R_y(\theta)$", bbox=dict(fc="lightblue"), ha='center')
ax.text(0.6, 0.5, "CNOT", bbox=dict(fc="gold"), ha='center')
ax.plot([0.6, 0.6], [0.3, 0.7], 'k-o')
ax.text(0.8, 0.7, r"$\mathcal{M}$", bbox=dict(boxstyle="square", fc="lightgrey"), ha='center')
def plot_symbolic_expression_tree(ax):
"""Symbolic RL: Policy as a Mathematical Expression Tree."""
ax.axis('off')
ax.set_title("Symbolic Policy Tree", fontsize=12, fontweight='bold')
nodes = {0:(0.5, 0.8, "+"), 1:(0.3, 0.5, "*"), 2:(0.7, 0.5, "exp"), 3:(0.2, 0.2, "s"), 4:(0.4, 0.2, "2.5"), 5:(0.7, 0.2, "s")}
edges = [(0,1), (0,2), (1,3), (1,4), (2,5)]
for k, (x, y, t) in nodes.items():
ax.text(x, y, t, bbox=dict(boxstyle="circle", fc="ivory"), ha='center')
for u, v in edges:
ax.annotate("", xy=nodes[v][:2], xytext=nodes[u][:2], arrowprops=dict(arrowstyle="-"))
def plot_differentiable_physics_gradient(ax):
"""Control: Differentiable Physics Gradient Flow."""
ax.axis('off')
ax.set_title("Diff-Physics Gradient", fontsize=12, fontweight='bold')
ax.text(0.1, 0.5, "Policy", bbox=dict(fc="ivory"), ha='center')
ax.text(0.5, 0.5, "Diff-Sim\nDynamics", bbox=dict(fc="gold", boxstyle="round"), ha='center')
ax.text(0.9, 0.5, "Loss", bbox=dict(fc="salmon"), ha='center')
# Forward
ax.annotate("", xy=(0.35, 0.5), xytext=(0.2, 0.5), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.75, 0.5), xytext=(0.65, 0.5), arrowprops=dict(arrowstyle="->"))
# Backward
ax.annotate("$\nabla$ gradient", xy=(0.15, 0.4), xytext=(0.85, 0.4), arrowprops=dict(arrowstyle="->", color='red', connectionstyle="arc3,rad=-0.2"))
def plot_marl_communication_channel(ax):
"""MARL: Communication Channel (CommNet/DIAL)."""
ax.axis('off')
ax.set_title("Multi-Agent Comm Channel", fontsize=12, fontweight='bold')
ax.text(0.2, 0.8, "Agent A", bbox=dict(fc="lightblue"), ha='center')
ax.text(0.8, 0.8, "Agent B", bbox=dict(fc="lightgreen"), ha='center')
ax.text(0.5, 0.2, "Task Goal", bbox=dict(fc="lightgrey"), ha='center')
# Message
ax.annotate("Message $m_{A \to B}$", xy=(0.7, 0.8), xytext=(0.3, 0.8), arrowprops=dict(arrowstyle="->", ls="--", color='purple'))
ax.annotate("", xy=(0.2, 0.45), xytext=(0.2, 0.7), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.8, 0.45), xytext=(0.8, 0.7), arrowprops=dict(arrowstyle="->"))
def plot_lagrangian_multiplier_landscape(ax):
"""Safety: Lagrangian Constraint Optimization."""
    x = np.linspace(-2, 2, 100); y = np.linspace(-2, 2, 100)
    X, Y = np.meshgrid(x, y); Z = (X - 1)**2 + (Y - 1)**2  # objective centered at (1, 1)
    ax.contour(X, Y, Z, levels=10, alpha=0.3)
    ax.axvline(x=0.5, color='red', ls='--', label=r"Constraint $g(s) \leq 0$")
    ax.scatter([1.0], [1.0], color='blue', label="Unconstrained Min")
    ax.scatter([0.5], [1.0], color='green', label="Constrained Min")  # projection onto the feasible half-plane x <= 0.5
ax.set_title("Lagrangian Constrained Opt", fontsize=12, fontweight='bold')
ax.legend(fontsize=7, loc='upper left')
def plot_maxq_task_hierarchy(ax):
"""HRL: MAXQ Recursive Task Decomposition."""
ax.axis('off')
ax.set_title("MAXQ Task Hierarchy", fontsize=12, fontweight='bold')
# Levels
ax.text(0.5, 0.9, "Root Task", bbox=dict(fc="gold"), ha='center')
ax.text(0.3, 0.6, "GetFuel", bbox=dict(fc="ivory"), ha='center')
ax.text(0.7, 0.6, "DeliverCargo", bbox=dict(fc="ivory"), ha='center')
ax.text(0.3, 0.3, "Navigate", bbox=dict(fc="lightgrey"), ha='center', fontsize=8)
ax.text(0.7, 0.3, "Unload", bbox=dict(fc="lightgrey"), ha='center', fontsize=8)
# Recursion
ax.annotate("", xy=(0.3, 0.65), xytext=(0.45, 0.85), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.7, 0.65), xytext=(0.55, 0.85), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.3, 0.35), xytext=(0.3, 0.55), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.7, 0.35), xytext=(0.7, 0.55), arrowprops=dict(arrowstyle="->"))
def plot_react_cycle_thinking(ax):
"""Agentic LLM: ReAct Loop (Thought-Action-Observation)."""
ax.axis('off')
ax.set_title(r"ReAct Cycle: $T \to A \to O$", fontsize=12, fontweight='bold')
steps = ["Thought", "Action", "Observation"]
colors = ["ivory", "lightblue", "lightgreen"]
for i, s in enumerate(steps):
angle = 2 * np.pi * i / 3
x, y = 0.5 + 0.3*np.cos(angle), 0.5 + 0.3*np.sin(angle)
ax.text(x, y, s, bbox=dict(boxstyle="round", fc=colors[i]), ha='center')
# Loop arrows
ax.annotate("", xy=(0.2, 0.5), xytext=(0.5, 0.8), arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=0.3"))
ax.annotate("", xy=(0.5, 0.2), xytext=(0.2, 0.5), arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=0.3"))
ax.annotate("", xy=(0.8, 0.5), xytext=(0.5, 0.2), arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=0.3"))
def plot_synaptic_plasticity_rl(ax):
"""Bio-inspired: Synaptic Plasticity (Hebbian RL/STDP)."""
ax.axis('off')
ax.set_title("Synaptic Plasticity RL", fontsize=12, fontweight='bold')
ax.text(0.3, 0.5, "Pre-neuron", bbox=dict(boxstyle="circle", fc="white"), ha='center')
ax.text(0.7, 0.5, "Post-neuron", bbox=dict(boxstyle="circle", fc="white"), ha='center')
ax.plot([0.35, 0.65], [0.5, 0.5], 'k', lw=4, label="Synapse $w$")
ax.text(0.5, 0.6, r"$\Delta w \propto \delta \cdot x_{pre} \cdot x_{post}$", color='red', ha='center', fontsize=10)
ax.annotate(r"TD Error $\delta$", xy=(0.5, 0.5), xytext=(0.5, 0.8), arrowprops=dict(arrowstyle="->", color='red'))
def plot_guided_policy_search_gps(ax):
"""Control: Guided Policy Search (GPS)."""
ax.axis('off')
ax.set_title("Guided Policy Search (GPS)", fontsize=12, fontweight='bold')
ax.plot([0.1, 0.9], [0.7, 0.8], 'b', label=r"Optimal Trajectory $\tau^*$")
ax.plot([0.1, 0.9], [0.6, 0.6], 'r--', label=r"Current Policy $\pi_\theta$")
ax.annotate("Minimize KL", xy=(0.5, 0.6), xytext=(0.5, 0.72), arrowprops=dict(arrowstyle="<->"))
ax.legend(fontsize=8, loc='lower right')
def plot_sim2real_jitter_latency(ax):
"""Robotics: Sim-to-Real Jitter & Latency Analysis."""
t = np.linspace(0, 10, 100)
ideal = np.sin(t)
jitter = ideal + 0.2*np.random.randn(100)
ax.plot(t, ideal, 'g', alpha=0.5, label="Simulator (Ideal)")
ax.step(t + 0.3, jitter, 'r', label="Real Robot (Latency+Jitter)")
ax.set_title("Sim-to-Real Temporal Mismatch", fontsize=12, fontweight='bold')
ax.set_xlabel("Time (s)")
ax.legend(fontsize=8)
def plot_ddpg_deterministic_gradient(ax):
"""Deterministic Policy Gradient (DDPG)."""
ax.axis('off')
ax.set_title("DDPG Gradient Flow", fontsize=12, fontweight='bold')
ax.text(0.2, 0.5, r"$\pi_\theta(s)$", bbox=dict(fc="lightblue"), ha='center')
ax.text(0.8, 0.5, r"$Q_w(s, a)$", bbox=dict(fc="gold"), ha='center')
ax.annotate(r"$\nabla_\theta J \approx \nabla_a Q(s,a)|_{a=\pi(s)} \nabla_\theta \pi_\theta(s)$", xy=(0.5, 0.2), xytext=(0.5, 0.4), arrowprops=dict(arrowstyle="->", color='red'), ha='center', fontsize=9)
ax.annotate("action", xy=(0.7, 0.5), xytext=(0.3, 0.5), arrowprops=dict(arrowstyle="->"))
def plot_dreamer_latent_rollout(ax):
"""Model-Based RL: Dreamer Latent imagination."""
ax.axis('off')
ax.set_title("Dreamer Latent imagination", fontsize=12, fontweight='bold')
for i in range(3):
ax.text(0.2 + i*0.3, 0.5, f"$z_{i}$", bbox=dict(boxstyle="circle", fc="lightgreen"), ha='center')
if i < 2:
ax.annotate("", xy=(0.35 + i*0.3, 0.5), xytext=(0.25 + i*0.3, 0.5), arrowprops=dict(arrowstyle="->"))
ax.text(0.3 + i*0.3, 0.7, r"$\hat{a}$", ha='center')
ax.text(0.5, 0.2, r"Policy $\pi(z)$ learned in latent space", fontsize=9, ha='center')
def plot_unreal_auxiliary_tasks(ax):
"""Deep RL: UNREAL Architecture (Auxiliary Tasks)."""
ax.axis('off')
ax.set_title("UNREAL Auxiliary Tasks", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Base Agent (A3C)", bbox=dict(fc="ivory"), ha='center')
tasks = ["Pixel Control", "Value Replay", "Reward Prediction"]
for i, t in enumerate(tasks):
ax.text(0.2 + i*0.3, 0.4, t, bbox=dict(fc="orange", alpha=0.3), ha='center', fontsize=8)
ax.annotate("", xy=(0.2+i*0.3, 0.5), xytext=(0.5, 0.75), arrowprops=dict(arrowstyle="->", ls=':'))
ax.text(0.5, 0.1, "Shared Representation Learning", fontweight='bold', ha='center', fontsize=9)
def plot_iql_expectile_loss(ax):
"""Offline RL: Implicit Q-Learning (IQL) Expectile."""
x = np.linspace(-2, 2, 100)
tau = 0.8
loss = np.where(x > 0, tau * x**2, (1-tau) * x**2)
ax.plot(x, loss, color='purple', lw=2)
ax.set_title(r"IQL Expectile Loss $L_\tau$", fontsize=12, fontweight='bold')
ax.axvline(0, color='black', alpha=0.3)
ax.text(1, 1, r"$\tau=0.8$", color='purple')
def plot_prioritized_sweeping(ax):
"""Model-Based: Prioritized Sweeping."""
ax.axis('off')
ax.set_title("Prioritized Sweeping", fontsize=12, fontweight='bold')
ax.text(0.2, 0.8, "State $s$", bbox=dict(fc="white"), ha='center')
ax.text(0.8, 0.2, "Priority Queue", bbox=dict(boxstyle="sawtooth", fc="gold"), ha='center')
ax.annotate(r"TD Error $|\delta|$", xy=(0.7, 0.3), xytext=(0.3, 0.7), arrowprops=dict(arrowstyle="->", color='red'))
ax.text(0.5, 0.5, "Update most affected states first", rotation=-35, fontsize=8)
def plot_dagger_expert_loop(ax):
"""Imitation: DAgger (Dataset Aggregation)."""
ax.axis('off')
ax.set_title("DAgger Expert Loop", fontsize=12, fontweight='bold')
ax.text(0.2, 0.7, r"Learner $\pi_\theta$", bbox=dict(fc="lightblue"), ha='center')
ax.text(0.8, 0.7, r"Expert $\pi^*$", bbox=dict(fc="gold"), ha='center')
ax.text(0.5, 0.3, r"Dataset $\mathcal{D}$", bbox=dict(boxstyle="round", fc="ivory"), ha='center')
ax.annotate("Collect", xy=(0.5, 0.4), xytext=(0.2, 0.6), arrowprops=dict(arrowstyle="->"))
ax.annotate("Relabel", xy=(0.8, 0.6), xytext=(0.5, 0.4), arrowprops=dict(arrowstyle="<-"))
ax.annotate("Train", xy=(0.25, 0.65), xytext=(0.4, 0.35), arrowprops=dict(arrowstyle="->", color='blue'))
def plot_spr_self_prediction(ax):
"""Deep RL: Self-Predictive Representations (SPR)."""
ax.axis('off')
ax.set_title("SPR: Self-Prediction", fontsize=12, fontweight='bold')
ax.text(0.2, 0.5, "Encoder", bbox=dict(fc="lightgrey"), ha='center')
ax.text(0.8, 0.7, "Target Latent", bbox=dict(fc="gold", alpha=0.3), ha='center')
ax.text(0.8, 0.3, "Predicted Latent", bbox=dict(fc="lightblue"), ha='center')
ax.annotate("", xy=(0.7, 0.7), xytext=(0.3, 0.55), arrowprops=dict(arrowstyle="->", ls='--'))
ax.annotate("", xy=(0.7, 0.3), xytext=(0.3, 0.45), arrowprops=dict(arrowstyle="->"))
ax.text(0.9, 0.5, "Consistency Loss", rotation=90, color='red', fontsize=8)
def plot_joint_action_space(ax):
"""MARL: Joint Action Space $A_1 \times A_2$."""
ax.set_title(r"Joint Action Space $A_1 \times A_2$", fontsize=12, fontweight='bold')
for x in range(3):
for y in range(3):
ax.scatter(x, y, color='blue', alpha=0.5)
ax.text(x, y+0.1, f"($a^k_{x}, a^j_{y}$)", fontsize=7, ha='center')
ax.set_xlabel("Agent 1 Actions")
ax.set_ylabel("Agent 2 Actions")
ax.set_xticks([0,1,2]); ax.set_yticks([0,1,2])
def plot_dec_pomdp_graph(ax):
"""MARL: Dec-POMDP Formal Model."""
ax.axis('off')
ax.set_title("Dec-POMDP Model", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Global State $s$", bbox=dict(fc="ivory"), ha='center')
ax.text(0.2, 0.4, "Obs $o_1$", bbox=dict(fc="lightblue"), ha='center')
ax.text(0.8, 0.4, "Obs $o_2$", bbox=dict(fc="lightgreen"), ha='center')
ax.text(0.5, 0.1, "Joint Reward $r$", bbox=dict(fc="gold"), ha='center')
ax.annotate("", xy=(0.2, 0.5), xytext=(0.45, 0.75), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.8, 0.5), xytext=(0.55, 0.75), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.45, 0.15), xytext=(0.2, 0.35), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.55, 0.15), xytext=(0.8, 0.35), arrowprops=dict(arrowstyle="->"))
def plot_bisimulation_metric(ax):
"""Theory: State Bisimulation Metric."""
ax.axis('off')
ax.set_title("Bisimulation Metric", fontsize=12, fontweight='bold')
ax.text(0.3, 0.6, "$s_1$", bbox=dict(boxstyle="circle", fc="white"), ha='center')
ax.text(0.7, 0.6, "$s_2$", bbox=dict(boxstyle="circle", fc="white"), ha='center')
ax.annotate("$d(s_1, s_2)$", xy=(0.65, 0.6), xytext=(0.35, 0.6), arrowprops=dict(arrowstyle="<->", color='purple'))
ax.text(0.5, 0.2, "States are equivalent if rewards and\ntransitions to equivalent states match", ha='center', fontsize=8)
def plot_reward_shaping_phi(ax):
"""Theory: Potential-Based Reward Shaping."""
ax.axis('off')
ax.set_title("Potential-Based Reward Shaping", fontsize=12, fontweight='bold')
ax.text(0.2, 0.5, "$s$", bbox=dict(fc="ivory"), ha='center')
ax.text(0.8, 0.5, "$s'$", bbox=dict(fc="ivory"), ha='center')
ax.annotate("", xy=(0.7, 0.5), xytext=(0.3, 0.5), arrowprops=dict(arrowstyle="->"))
ax.text(0.5, 0.7, r"$\gamma \Phi(s') - \Phi(s)$", color='blue', ha='center')
ax.text(0.5, 0.3, "Added to environmental reward $r$", fontsize=8, ha='center')
def plot_transfer_rl_source_target(ax):
"""Training: Transfer RL (Source to Target)."""
ax.axis('off')
ax.set_title("Transfer RL: Source to Target", fontsize=12, fontweight='bold')
ax.text(0.3, 0.7, r"Source Task $\mathcal{T}_A$", bbox=dict(fc="lightblue"), ha='center')
ax.text(0.7, 0.3, r"Target Task $\mathcal{T}_B$", bbox=dict(fc="lightgreen"), ha='center')
ax.annotate("Knowledge Transfer\n(Weights/Expert Data)", xy=(0.6, 0.4), xytext=(0.4, 0.6), arrowprops=dict(arrowstyle="->", lw=2, color='orange'), ha='center')
def plot_multi_task_backbone(ax):
"""Deep RL: Multi-Task Architecture."""
ax.axis('off')
ax.set_title("Multi-Task Backbone Arch", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "State Input", bbox=dict(fc="lightgrey"), ha='center')
ax.text(0.5, 0.5, "Shared Backbone", bbox=dict(fc="cornflowerblue"), ha='center')
ax.text(0.2, 0.2, "Task 1 Head", bbox=dict(fc="orange", alpha=0.5), ha='center')
ax.text(0.8, 0.2, "Task N Head", bbox=dict(fc="orange", alpha=0.5), ha='center')
ax.annotate("", xy=(0.5, 0.6), xytext=(0.5, 0.75), arrowprops=dict(arrowstyle="<-"))
ax.annotate("", xy=(0.25, 0.3), xytext=(0.45, 0.45), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.75, 0.3), xytext=(0.55, 0.45), arrowprops=dict(arrowstyle="->"))
def plot_contextual_bandit_pipeline(ax):
"""Bandits: Contextual Bandit Pipeline."""
ax.axis('off')
ax.set_title("Contextual Bandit Pipeline", fontsize=12, fontweight='bold')
ax.text(0.1, 0.5, r"Context $x$", bbox=dict(fc="ivory"), ha='center')
ax.text(0.5, 0.5, r"Policy $\pi(a|x)$", bbox=dict(fc="lightgreen"), ha='center')
ax.text(0.9, 0.5, r"Reward $r$", bbox=dict(fc="gold"), ha='center')
ax.annotate("", xy=(0.4, 0.5), xytext=(0.2, 0.5), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.8, 0.5), xytext=(0.6, 0.5), arrowprops=dict(arrowstyle="->"))
def plot_regret_bounds_theoretical(ax):
"""Theory: Regret Upper/Lower Bounds."""
t = np.linspace(1, 100, 100)
ax.plot(t, np.sqrt(t), label=r"Upper Bound $O(\sqrt{T})$", color='red')
ax.plot(t, np.log(t), label=r"Optimal Regret $O(\log T)$", color='blue')
ax.set_title("Theoretical Regret Bounds", fontsize=12, fontweight='bold')
ax.set_xlabel("Time $T$")
ax.set_ylabel("Cumulative Regret")
ax.legend()
def plot_soft_q_heatmap(ax):
"""Value-based: Soft Q-Learning Heatmap."""
    data = np.random.randn(10, 10)
    # Boltzmann probabilities are a per-state (row-wise) softmax: pi(a|s) proportional to exp(Q(s, a)).
    soft_q = np.exp(data) / np.exp(data).sum(axis=1, keepdims=True)
    im = ax.imshow(soft_q, cmap='hot')
plt.colorbar(im, ax=ax)
ax.set_title("Soft Q Boltzmann Probabilities", fontsize=12, fontweight='bold')
def plot_ad_rl_pipeline(ax):
"""Robotics: Autonomous Driving RL Pipeline."""
ax.axis('off')
ax.set_title("Autonomous Driving RL Pipeline", fontsize=12, fontweight='bold')
modules = ["Sensors", "Perception (CNN)", "RL Policy", "Actuators"]
for i, m in enumerate(modules):
ax.text(0.25 + (i%2)*0.5, 0.7 - (i//2)*0.5, m, bbox=dict(fc="ivory"), ha='center')
ax.annotate("", xy=(0.7, 0.7), xytext=(0.3, 0.7), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.75, 0.35), xytext=(0.75, 0.6), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.3, 0.2), xytext=(0.7, 0.2), arrowprops=dict(arrowstyle="<-"))
def plot_action_grad_comparison(ax):
"""Policy: Stochastic vs Deterministic Gradients."""
ax.axis('off')
ax.set_title("Action Gradient Types", fontsize=12, fontweight='bold')
ax.text(0.5, 0.7, r"Stochastic: $\nabla \log \pi(a|s) Q(s,a)$", color='blue', ha='center')
ax.text(0.5, 0.3, r"Deterministic: $\nabla_a Q(s,a) \nabla \pi(s)$", color='red', ha='center')
ax.text(0.5, 0.5, "vs", fontweight='bold', ha='center')
def plot_irl_feature_matching(ax):
"""IRL: Feature Expectation Matching."""
ax.axis('off')
ax.set_title("IRL: Feature Expectation Matching", fontsize=12, fontweight='bold')
ax.text(0.2, 0.5, r"Expert $\mu(\pi^*)$", bbox=dict(fc="gold"), ha='center')
ax.text(0.8, 0.5, r"Learner $\mu(\pi)$", bbox=dict(fc="lightblue"), ha='center')
ax.annotate(r"$||\mu(\pi^*) - \mu(\pi)||_2 \leq \epsilon$", xy=(0.5, 0.2), ha='center', color='red')
ax.annotate("", xy=(0.65, 0.5), xytext=(0.35, 0.5), arrowprops=dict(arrowstyle="<->", ls='--'))
def plot_apprenticeship_learning_loop(ax):
"""Imitation: Apprenticeship Learning Loop."""
ax.axis('off')
ax.set_title("Apprenticeship Learning Loop", fontsize=12, fontweight='bold')
nodes = ["Expert Demos", "Reward Learning", "Agent Policy", "Environment"]
for i, n in enumerate(nodes):
ax.text(0.5, 0.9 - i*0.25, n, bbox=dict(fc="ivory"), ha='center')
if i < 3: ax.annotate("", xy=(0.5, 0.7 - i*0.25), xytext=(0.5, 0.8 - i*0.25), arrowprops=dict(arrowstyle="->"))
ax.annotate("feedback", xy=(0.3, 0.9), xytext=(0.3, 0.15), arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=-0.5"))
def plot_active_inference_loop(ax):
"""Theoretical: Active Inference / Free Energy Loop."""
ax.axis('off')
ax.set_title("Active Inference Loop", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Internal Model (Generative)", bbox=dict(fc="cornflowerblue", alpha=0.3), ha='center')
ax.text(0.5, 0.2, "External Environment", bbox=dict(fc="lightgrey"), ha='center')
ax.annotate("Action (Active Charge)", xy=(0.8, 0.25), xytext=(0.8, 0.75), arrowprops=dict(arrowstyle="<-", color='red'))
ax.annotate("Perception (Surprise Min)", xy=(0.2, 0.75), xytext=(0.2, 0.25), arrowprops=dict(arrowstyle="<-", color='blue'))
ax.text(0.5, 0.5, r"$\min F = D_{KL}(q||p)$", ha='center', fontweight='bold')
def plot_bellman_residual_landscape(ax):
"""Theory: Bellman Residual Landscape."""
X, Y = np.meshgrid(np.linspace(-2, 2, 20), np.linspace(-2, 2, 20))
Z = (X**2 + Y**2) + 0.5 * np.sin(3*X) # Non-convex loss
ax.contourf(X, Y, Z, cmap='magma')
ax.set_title("Bellman Residual Landscape", fontsize=12, fontweight='bold')
def plot_plan_to_explore_map(ax):
"""MBRL: Plan-to-Explore Uncertainty Map."""
data = np.random.rand(10, 10)
im = ax.imshow(data, cmap='YlOrRd')
ax.set_title("Plan-to-Explore Uncertainty", fontsize=12, fontweight='bold')
ax.text(2, 2, "Explored", color='black', fontsize=8)
ax.text(7, 7, "Unknown", color='red', fontweight='bold', fontsize=8)
def plot_robust_rl_uncertainty_set(ax):
"""Safety: Robust RL Uncertainty Set."""
ax.axis('off')
ax.set_title("Robust RL Uncertainty Set", fontsize=12, fontweight='bold')
circle = plt.Circle((0.5, 0.5), 0.3, color='blue', alpha=0.1)
ax.add_patch(circle)
ax.text(0.5, 0.5, r"$\mathcal{P}$", fontsize=20, ha='center')
ax.text(0.5, 0.1, r"$\min_\pi \max_{P \in \mathcal{P}} \mathbb{E}[R]$", ha='center', fontsize=12)
ax.annotate("Nominal Model", xy=(0.5, 0.5), xytext=(0.2, 0.8), arrowprops=dict(arrowstyle="->"))
def plot_hpo_bayesian_opt_cycle(ax):
"""Training: HPO Bayesian Optimization Cycle."""
ax.axis('off')
ax.set_title("HPO Bayesian Opt Cycle", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Surrogate Model (GP)", bbox=dict(fc="ivory"), ha='center')
ax.text(0.5, 0.2, "RL Objective Function", bbox=dict(fc="ivory"), ha='center')
ax.annotate("Select Hyperparams", xy=(0.7, 0.3), xytext=(0.7, 0.7), arrowprops=dict(arrowstyle="<-"))
ax.annotate("Update Model", xy=(0.3, 0.7), xytext=(0.3, 0.3), arrowprops=dict(arrowstyle="<-"))
def plot_slate_rl_reco_pipeline(ax):
"""Applied: Slate RL / Recommendation Pipeline."""
ax.axis('off')
ax.set_title("Slate RL Recommendation", fontsize=12, fontweight='bold')
ax.text(0.1, 0.5, "User State", bbox=dict(fc="lightblue"), ha='center')
ax.text(0.5, 0.5, "Slate Policy", bbox=dict(fc="gold"), ha='center')
ax.text(0.9, 0.5, "Action (Items)", bbox=dict(fc="lightgreen"), ha='center')
ax.annotate("", xy=(0.4, 0.5), xytext=(0.2, 0.5), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.8, 0.5), xytext=(0.6, 0.5), arrowprops=dict(arrowstyle="->"))
ax.text(0.5, 0.2, "Combinatorial Action Space", fontsize=8, ha='center')
def plot_game_theory_fictitious_play(ax):
"""Multi-Agent: Fictitious Play Interaction."""
ax.axis('off')
ax.set_title("Fictitious Play Interaction", fontsize=12, fontweight='bold')
ax.text(0.2, 0.7, "Agent A (Best Response)", bbox=dict(fc="white"), ha='center')
ax.text(0.8, 0.7, "Agent B (Best Response)", bbox=dict(fc="white"), ha='center')
ax.text(0.5, 0.3, r"Empirical Frequency $\hat{\pi}$", bbox=dict(fc="ivory"), ha='center')
ax.annotate("", xy=(0.45, 0.4), xytext=(0.25, 0.6), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.55, 0.4), xytext=(0.75, 0.6), arrowprops=dict(arrowstyle="->"))
def plot_universal_rl_framework(ax):
"""Conceptual: Universal RL Framework Diagram."""
ax.axis('off')
ax.set_title("Universal RL Framework", fontsize=12, fontweight='bold')
rect = plt.Rectangle((0.15, 0.15), 0.7, 0.7, fill=False, ls='--')
ax.add_patch(rect)
ax.text(0.5, 0.5, "RL Agent\n(Algorithm + Model + Exp)", ha='center', fontweight='bold')
ax.text(0.5, 0.9, "Problem Context", ha='center', color='grey')
ax.text(0.5, 0.1, "Reward / Evaluation", ha='center', color='grey')
def plot_offline_density_ratio(ax):
"""Offline RL: Density Ratio Estimation $w(s,a)$."""
    from scipy.stats import norm  # local import, mirroring the scipy.stats pattern used elsewhere in this file
    x = np.linspace(-3, 3, 100)
pi_e = norm.pdf(x, 0, 1)
pi_b = norm.pdf(x, 1, 1.5)
ax.plot(x, pi_e, label=r"Policy $\pi_e$")
ax.plot(x, pi_b, label=r"Behavior $\pi_b$", ls='--')
ax.fill_between(x, pi_e / (pi_b + 1e-5), alpha=0.1, label="Ratio $w$")
ax.set_title(r"Offline Density Ratio $w(s,a)$", fontsize=12, fontweight='bold')
ax.legend(fontsize=8)
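# Hedged sketch of how such ratios are used for off-policy evaluation: an
# importance-weighted value estimate over a logged batch, where `w` holds
# per-sample density ratios and `r` the logged rewards (both hypothetical).
def importance_weighted_value(w, r, self_normalize=True):
    w, r = np.asarray(w), np.asarray(r)
    if self_normalize:
        return float(np.sum(w * r) / np.sum(w))  # lower variance, mildly biased
    return float(np.mean(w * r))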
def plot_continual_task_interference(ax):
"""Continual RL: Task Interference Heatmap."""
data = np.eye(5) + 0.1 * np.random.randn(5, 5)
data[1,0] = -0.5 # Interference
im = ax.imshow(data, cmap='coolwarm', vmin=-1, vmax=1)
plt.colorbar(im, ax=ax)
ax.set_title("Continual Task Interference", fontsize=12, fontweight='bold')
ax.set_xlabel("Previously Learned Tasks"); ax.set_ylabel("Current Task")
def plot_lyapunov_safe_set(ax):
"""Safety: Lyapunov Stability Set."""
ax.set_title("Lyapunov Safe Set", fontsize=12, fontweight='bold')
theta = np.linspace(0, 2*np.pi, 100)
r = 1 + 0.2 * np.sin(4*theta)
ax.fill(r * np.cos(theta), r * np.sin(theta), color='green', alpha=0.1, label="Invariant Set")
ax.plot(r * np.cos(theta), r * np.sin(theta), color='green')
ax.quiver(0.5, 0.5, -0.4, -0.4, color='red', scale=5, label="Energy Decrease")
ax.legend(fontsize=8); ax.set_xlim(-1.5, 1.5); ax.set_ylim(-1.5, 1.5)
def plot_molecular_rl_atoms(ax):
"""Applied: Molecular RL (Atoms)."""
ax.set_title("Molecular RL (Atom State)", fontsize=12, fontweight='bold')
for _ in range(5):
pos = np.random.rand(2)
circle = plt.Circle(pos, 0.05, color='blue', alpha=0.7)
ax.add_patch(circle)
ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis('off')
ax.text(0.5, -0.05, "States = Atomic Coordinates", ha='center', fontsize=8)
def plot_moe_multi_task_arch(ax):
"""Architecture: MoE for Multi-task."""
ax.axis('off')
ax.set_title("MoE Multi-task Architecture", fontsize=12, fontweight='bold')
ax.text(0.5, 0.9, "Gating Network", bbox=dict(fc="orange"), ha='center')
for i in range(3):
ax.text(0.2 + i*0.3, 0.5, f"Expert {i+1}", bbox=dict(fc="ivory"), ha='center')
ax.annotate("", xy=(0.2 + i*0.3, 0.6), xytext=(0.5, 0.8), arrowprops=dict(arrowstyle="->"))
ax.text(0.5, 0.2, "Joint Output", bbox=dict(fc="lightgrey"), ha='center')
def plot_cma_es_distribution(ax):
"""Direct Policy Search: CMA-ES Distribution."""
x = np.random.randn(200, 2)
ax.scatter(x[:,0], x[:,1], alpha=0.3, color='grey')
circle = plt.Circle((0, 0), 1.5, fill=False, color='red', lw=2, label="Sample Ellipsoid")
ax.add_patch(circle)
ax.set_title("CMA-ES Policy Search", fontsize=12, fontweight='bold')
ax.legend(fontsize=8)
def plot_elo_rating_preference(ax):
"""Alignment: Elo Rating Preference Plot."""
x = np.linspace(0, 10, 10)
y = 1000 + 100 * np.log(x + 1) + 20 * np.random.randn(10)
ax.step(x, y, color='purple', where='post')
ax.set_title("Policy Elo Rating vs Experience", fontsize=12, fontweight='bold')
ax.set_xlabel("Relative Training Time"); ax.set_ylabel("Elo Rating")
def plot_shap_lime_attribution(ax):
"""Explainable RL: SHAP/LIME Attribution."""
ax.set_title("Action Attribution (SHAP)", fontsize=12, fontweight='bold')
feats = ["Dist to Goal", "Velocity", "Agent Pitch", "Sensor 4"]
vals = [0.6, -0.3, 0.1, 0.05]
colors = ['green' if v > 0 else 'red' for v in vals]
ax.barh(feats, vals, color=colors)
ax.set_xlabel("Contribution to Action probability")
def plot_pearl_context_encoder(ax):
"""Meta-RL: Context Encoder (PEARL)."""
ax.axis('off')
ax.set_title("PEARL Context Encoder", fontsize=12, fontweight='bold')
ax.text(0.2, 0.5, "Experience batch\n(s, a, r, s')", bbox=dict(fc="ivory"), ha='center', fontsize=8)
ax.text(0.5, 0.5, r"Encoder $q_\phi(z|...)$", bbox=dict(fc="lightblue"), ha='center')
ax.text(0.8, 0.5, "Latent Task $z$", bbox=dict(boxstyle="circle", fc="lightgreen"), ha='center')
ax.annotate("", xy=(0.4, 0.5), xytext=(0.3, 0.5), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.7, 0.5), xytext=(0.6, 0.5), arrowprops=dict(arrowstyle="->"))
def plot_healthcare_rl_pipeline(ax):
"""Applied: Healthcare / Medical Therapy."""
ax.axis('off')
ax.set_title("Medical RL Therapy Pipeline", fontsize=12, fontweight='bold')
blocks = ["Patient History (EHR)", "State Estimator", "Policy (Action = Dose)", "Clinical Outcome"]
for i, b in enumerate(blocks):
ax.text(0.5, 0.9 - i*0.25, b, bbox=dict(fc="pink", alpha=0.3), ha='center')
if i < 3: ax.annotate("", xy=(0.5, 0.7 - i*0.25), xytext=(0.5, 0.8 - i*0.25), arrowprops=dict(arrowstyle="->"))
def plot_supply_chain_rl(ax):
"""Applied: Supply Chain / Inventory RL."""
ax.axis('off')
ax.set_title("Supply Chain RL Pipeline", fontsize=12, fontweight='bold')
    nodes = ["Factory", "Warehouse", "Retailer", "Customer"]
for i, n in enumerate(nodes):
ax.text(0.1 + i*0.27, 0.5, n, bbox=dict(boxstyle="round", fc="ivory"), ha='center')
for i in range(3):
ax.annotate("", xy=(0.28 + i*0.27, 0.5), xytext=(0.2 + i*0.27, 0.5), arrowprops=dict(arrowstyle="->"))
ax.text(0.5, 0.2, "State = Stock Levels, Action = Orders", ha='center', fontsize=8)
def plot_sysid_safe_loop(ax):
"""Robotics: Sim-to-Real SysID Loop."""
ax.axis('off')
ax.set_title("Sim-to-Real SysID Loop", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Physical System", bbox=dict(fc="lightgreen"), ha='center')
ax.text(0.5, 0.5, "System ID Estimator", bbox=dict(fc="orange", alpha=0.5), ha='center')
ax.text(0.5, 0.2, "Simulation Model", bbox=dict(fc="lightblue"), ha='center')
ax.annotate("Observables", xy=(0.4, 0.6), xytext=(0.4, 0.75), arrowprops=dict(arrowstyle="<-"))
ax.annotate("Update Parameters", xy=(0.6, 0.3), xytext=(0.6, 0.45), arrowprops=dict(arrowstyle="<-"))
def plot_transformer_world_model(ax):
"""Architecture: Transformer World Model."""
ax.axis('off')
ax.set_title("Transformer World Model", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Sequence of $(s, a, r)$", bbox=dict(fc="ivory"), ha='center')
ax.text(0.5, 0.5, "Self-Attention Layers", bbox=dict(fc="purple", alpha=0.3), ha='center')
ax.text(0.5, 0.2, "Predicted $s_{t+1}, r_{t+1}$", bbox=dict(fc="lightgreen"), ha='center')
ax.annotate("", xy=(0.5, 0.6), xytext=(0.5, 0.75), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.5, 0.3), xytext=(0.5, 0.45), arrowprops=dict(arrowstyle="->"))
def plot_network_rl(ax):
"""Applied: RL for Networking."""
ax.axis('off')
ax.set_title("Network Traffic RL", fontsize=12, fontweight='bold')
G = nx.Graph()
G.add_edges_from([(0,1), (1,2), (2,3), (3,0)])
pos = nx.spring_layout(G)
nx.draw(G, pos, ax=ax, node_color='lightblue', with_labels=False)
ax.annotate("RL Router", xy=(pos[1][0], pos[1][1]), xytext=(pos[1][0], pos[1][1]+0.2), arrowprops=dict(arrowstyle="->"))
def plot_rlhf_ppo_ref(ax):
"""Training: RLHF PPO with Reference Policy."""
ax.axis('off')
ax.set_title("RLHF: PPO with Reference Policy", fontsize=12, fontweight='bold')
ax.text(0.3, 0.8, r"Active Policy $\pi_\theta$", bbox=dict(fc="ivory"), ha='center')
ax.text(0.7, 0.8, r"Ref Policy $\pi_{ref}$", bbox=dict(fc="lightgrey"), ha='center')
ax.text(0.5, 0.5, "KL Penalty", bbox=dict(boxstyle="sawtooth", fc="red", alpha=0.3), ha='center')
ax.text(0.5, 0.2, "Reward Model $r(s,a)$", bbox=dict(fc="gold"), ha='center')
ax.annotate("", xy=(0.4, 0.6), xytext=(0.3, 0.75), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.6, 0.6), xytext=(0.7, 0.75), arrowprops=dict(arrowstyle="->"))
ax.annotate("Total Reward", xy=(0.5, 0.4), xytext=(0.5, 0.3), arrowprops=dict(arrowstyle="<-"))
def plot_psro_meta_game(ax):
"""Multi-Agent: PSRO Meta-Game Tree."""
ax.axis('off')
ax.set_title("PSRO Meta-Game Update", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Meta-Game Matrix", bbox=dict(fc="ivory"), ha='center')
ax.text(0.2, 0.5, "Best Response", bbox=dict(fc="lightblue"), ha='center')
ax.text(0.8, 0.5, "Nash Equilibrium", bbox=dict(fc="lightgreen"), ha='center')
ax.text(0.5, 0.2, "Add Oracle Policy", bbox=dict(fc="gold"), ha='center', fontweight='bold')
ax.annotate("", xy=(0.3, 0.6), xytext=(0.45, 0.75), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.7, 0.6), xytext=(0.55, 0.75), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.5, 0.3), xytext=(0.3, 0.45), arrowprops=dict(arrowstyle="->"))
def plot_dial_comm_channel(ax):
"""Multi-Agent: DIAL Comm Channel."""
ax.axis('off')
ax.set_title("DIAL: Differentiable Comm", fontsize=12, fontweight='bold')
ax.text(0.2, 0.5, "Agent 1", bbox=dict(boxstyle="circle", fc="lightblue"), ha='center')
ax.text(0.8, 0.5, "Agent 2", bbox=dict(boxstyle="circle", fc="lightblue"), ha='center')
ax.annotate("Message $m$ (Differentiable)", xy=(0.7, 0.52), xytext=(0.3, 0.52), arrowprops=dict(arrowstyle="->", lw=2, color='orange'))
ax.annotate("Gradient $\\nabla m$", xy=(0.3, 0.48), xytext=(0.7, 0.48), arrowprops=dict(arrowstyle="->", lw=1, color='blue', ls='--'))
def plot_fqi_batch_loop(ax):
"""Batch RL: Fitted Q-Iteration (FQI)."""
ax.axis('off')
ax.set_title("Fitted Q-Iteration Loop", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, r"Dataset $\mathcal{D}$", bbox=dict(boxstyle="round", fc="ivory"), ha='center')
ax.text(0.5, 0.5, "Supervised Regressor", bbox=dict(fc="orange", alpha=0.3), ha='center')
ax.text(0.5, 0.2, "Updated $Q_{k+1}$", bbox=dict(fc="lightgreen"), ha='center')
ax.annotate("", xy=(0.5, 0.6), xytext=(0.5, 0.75), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.5, 0.3), xytext=(0.5, 0.45), arrowprops=dict(arrowstyle="->"))
ax.annotate("Bootstrap", xy=(0.8, 0.3), xytext=(0.8, 0.7), arrowprops=dict(arrowstyle="<-", connectionstyle="arc3,rad=-0.5"))
def plot_cmdp_feasible_set(ax):
"""Safety RL: CMDP Feasible Set."""
ax.set_title("CMDP Feasible Region", fontsize=12, fontweight='bold')
circle = plt.Circle((0, 0), 1, alpha=0.2, color='green', label="Constrained Feasible Set")
ax.add_patch(circle)
ax.axhline(0.7, color='red', ls='--', label=r"Constraint $J \leq C$")
ax.text(0, -0.3, r"Optimized Policy $\pi^*$", color='blue', fontweight='bold', ha='center')
ax.set_xlim(-1.5, 1.5); ax.set_ylim(-1.5, 1.5)
ax.legend(fontsize=8)
def plot_mpc_vs_rl_horizon(ax):
"""Control: MPC vs RL Comparison."""
ax.axis('off')
ax.set_title("MPC vs RL Planning", fontsize=12, fontweight='bold')
ax.text(0.25, 0.8, "MPC", fontweight='bold')
ax.text(0.75, 0.8, "RL", fontweight='bold')
ax.text(0.25, 0.5, "Receding Horizon\nPlanning at every step", ha='center', fontsize=8)
ax.text(0.75, 0.5, "Direct Mapping from\nState to Action (Policy)", ha='center', fontsize=8)
ax.text(0.5, 0.2, "Convergent when Model is Exact", color='grey', ha='center', fontsize=7)
def plot_l2o_meta_pipeline(ax):
"""AutoML: Learning to Optimize (L2O)."""
ax.axis('off')
ax.set_title("Learning to Optimize (L2O)", fontsize=12, fontweight='bold')
ax.text(0.5, 0.7, "Optimizer (RL Policy)", bbox=dict(fc="cornflowerblue"), ha='center')
ax.text(0.5, 0.3, "Optimizee (Deep Net)", bbox=dict(fc="lightgrey"), ha='center')
ax.annotate(r"Step $\Delta w$", xy=(0.5, 0.4), xytext=(0.5, 0.6), arrowprops=dict(arrowstyle="->"))
ax.annotate(r"Gradient $\nabla L$", xy=(0.2, 0.6), xytext=(0.2, 0.4), arrowprops=dict(arrowstyle="->", color='red'))
def plot_chip_placement_rl(ax):
"""Applied: RL for Chip Placement."""
ax.set_title("RL for Chip Placement", fontsize=12, fontweight='bold')
ax.grid(True, ls='--', alpha=0.3)
for _ in range(8):
pos = np.random.rand(2)
rect = plt.Rectangle(pos, 0.1, 0.1, facecolor='lightblue', edgecolor='blue', alpha=0.7)
ax.add_patch(rect)
ax.set_xlim(0, 1); ax.set_ylim(0, 1)
ax.text(0.5, -0.15, "Optimizing Macro Placement on Silicon", ha='center', fontsize=8)
def plot_compiler_mlgo(ax):
"""Applied: RL for Compiler Optimization (MLGO)."""
ax.axis('off')
ax.set_title("MLGO: Compiler RL", fontsize=12, fontweight='bold')
G = nx.DiGraph()
G.add_edges_from([(0,1), (0,2), (1,3), (2,3)])
pos = {0: (0.5, 0.9), 1: (0.3, 0.6), 2: (0.7, 0.6), 3: (0.5, 0.3)}
nx.draw(G, pos, ax=ax, node_color='lightgreen', with_labels=False)
ax.text(0.5, 0.1, "Control Flow Graph (CFG) + Inline Policy", ha='center', fontsize=8)
def plot_theorem_proving_rl(ax):
"""Applied: RL for Theorem Proving."""
ax.axis('off')
ax.set_title("RL for Theorem Proving", fontsize=12, fontweight='bold')
ax.text(0.5, 0.9, "Target Theorem", bbox=dict(fc="ivory"), ha='center')
ax.text(0.3, 0.5, "Proof Step $a$", bbox=dict(fc="lightblue"), ha='center')
ax.text(0.7, 0.5, "Heuristic $V(s)$", bbox=dict(fc="gold"), ha='center')
ax.text(0.5, 0.2, "Verified Proof Tree", ha='center', fontsize=8)
ax.annotate("", xy=(0.35, 0.6), xytext=(0.45, 0.8), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.65, 0.6), xytext=(0.55, 0.8), arrowprops=dict(arrowstyle="->"))
def plot_diffusion_ql_loop(ax):
"""Modern: Diffusion-QL Offline RL."""
ax.axis('off')
ax.set_title("Diffusion-QL Training", fontsize=12, fontweight='bold')
ax.text(0.2, 0.5, r"Noise $\epsilon$", ha='center')
ax.text(0.5, 0.5, r"Denoising MLP\n$\pi_\theta(a|s, k)$", bbox=dict(fc="lightgrey"), ha='center')
ax.text(0.8, 0.5, "Action $a$", ha='center')
ax.annotate("", xy=(0.35, 0.5), xytext=(0.25, 0.5), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.65, 0.5), xytext=(1.0, 0.5), arrowprops=dict(arrowstyle="<-"))
ax.text(0.5, 0.2, "Policy as a Reverse Diffusion Process", fontsize=8, ha='center')
def plot_fairness_rl_pareto(ax):
"""Principles: Fairness-aware RL Pareto."""
ax.set_title("Fairness-Reward Pareto Frontier", fontsize=12, fontweight='bold')
x = np.linspace(0.1, 1, 100)
y = 1 - x**2
ax.plot(x, y, color='purple', lw=3, label="Pareto Frontier")
ax.fill_between(x, 0, y, color='purple', alpha=0.1)
ax.set_xlabel("Reward $R$"); ax.set_ylabel("Fairness Metric $F$")
ax.legend(fontsize=8)
def plot_dp_rl_noise(ax):
"""Principles: Differentially Private RL."""
ax.axis('off')
ax.set_title("Differentially Private RL", fontsize=12, fontweight='bold')
ax.text(0.3, 0.5, r"Algorithm $\mathcal{A}$", bbox=dict(fc="ivory"), ha='center')
ax.text(0.5, 0.5, r"$\mathcal{N}(0, \sigma^2 \mathbb{I})$", bbox=dict(fc="red", alpha=0.3), ha='center')
ax.text(0.7, 0.5, r"Privacy Budget $\epsilon, \delta$", bbox=dict(fc="lightgrey"), ha='center')
ax.annotate("", xy=(0.4, 0.5), xytext=(0.3, 0.5), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.6, 0.5), xytext=(0.45, 0.5), arrowprops=dict(arrowstyle="->"))
def plot_smart_agriculture_rl(ax):
"""Applied: Smart Agriculture RL."""
ax.axis('off')
ax.set_title("Smart Agriculture RL", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Soil/Weather Sensors", bbox=dict(fc="lightblue"), ha='center')
ax.text(0.5, 0.5, "Irrigation Policy", bbox=dict(fc="gold"), ha='center')
ax.text(0.5, 0.2, "Yield Optimization", bbox=dict(fc="lightgreen"), ha='center')
ax.annotate("", xy=(0.5, 0.6), xytext=(0.5, 0.75), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.5, 0.3), xytext=(0.5, 0.45), arrowprops=dict(arrowstyle="->"))
def plot_climate_rl_grid(ax):
"""Applied: Climate Science RL."""
ax.set_title("Climate Mitigation RL (Grid)", fontsize=12, fontweight='bold')
data = np.random.randn(10, 10)
im = ax.imshow(data, cmap='coolwarm')
ax.set_xlabel("Longitude"); ax.set_ylabel("Latitude")
ax.text(5, 5, "Carbon Sequestration\nControl Map", ha='center', color='white', fontweight='bold', fontsize=8)
def plot_ai_education_tracing(ax):
"""Applied: Intelligent Tutoring Systems RL."""
ax.axis('off')
ax.set_title("AI Education (Knowledge Tracing)", fontsize=12, fontweight='bold')
nodes = ["Concept 1", "Concept 2", "Student State $S_t$", "Next Problem $a_t$"]
for i, n in enumerate(nodes):
ax.text(0.2 + (i%2)*0.6, 0.7 - (i//2)*0.4, n, bbox=dict(fc="pink", alpha=0.3), ha='center')
ax.annotate("", xy=(0.6, 0.5), xytext=(0.4, 0.5), arrowprops=dict(arrowstyle="->"))
def plot_decision_sde_flow(ax):
"""Modern: Decision SDEs."""
ax.set_title(r"Decision SDE Flow $dX_t = f(X_t, u_t)dt + gdW_t$", fontsize=10, fontweight='bold')
t = np.linspace(0, 1, 100)
for _ in range(5):
path = np.cumsum(np.random.normal(0, 0.1, size=100))
ax.plot(t, path + 0.5*t, alpha=0.5)
ax.set_xlabel("Continuous Time $t$")
def plot_diff_physics_brax(ax):
"""Control: Differentiable Physics (Brax)."""
ax.axis('off')
ax.set_title(r"Differentiable physics $\nabla_{u} \mathcal{L}$", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Physics Engine (Jacobian)", bbox=dict(fc="orange", alpha=0.1), ha='center')
ax.text(0.5, 0.5, "Simulator Layer", bbox=dict(fc="lightgrey"), ha='center')
ax.text(0.5, 0.2, "Policy Update", bbox=dict(fc="blue", alpha=0.1), ha='center')
ax.annotate("", xy=(0.5, 0.4), xytext=(0.5, 0.6), arrowprops=dict(arrowstyle="<-", color='red', label="Grads"))
def plot_beamforming_rl(ax):
"""Applied: RL for Beamforming."""
ax.axis('off')
ax.set_title("Wireless Beamforming RL", fontsize=12, fontweight='bold')
ax.add_patch(plt.Circle((0.2, 0.5), 0.05, color='black'))
theta = np.linspace(-np.pi/4, np.pi/4, 100)
r = np.cos(4*theta)
ax.plot(0.2 + r*np.cos(theta), 0.5 + r*np.sin(theta), color='orange', label="Main Lobe")
ax.text(0.8, 0.5, "User Device", bbox=dict(boxstyle="round", fc="lightgrey"), ha='center')
def plot_quantum_error_correction_rl(ax):
"""Applied: Quantum Error Correction RL."""
ax.axis('off')
ax.set_title("Quantum Error Correction RL", fontsize=12, fontweight='bold')
ax.text(0.1, 0.5, "Syndrome $S$", bbox=dict(fc="ivory"), ha='center')
ax.text(0.5, 0.5, "Decoder Agent", bbox=dict(boxstyle="round4", fc="purple", alpha=0.2), ha='center')
ax.text(0.9, 0.5, "Recovery $P$", bbox=dict(fc="gold"), ha='center')
ax.annotate("", xy=(0.4, 0.5), xytext=(0.2, 0.5), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.8, 0.5), xytext=(0.6, 0.5), arrowprops=dict(arrowstyle="->"))
def plot_mean_field_rl(ax):
"""Multi-Agent: Mean Field RL."""
ax.axis('off')
ax.set_title("Mean Field RL Interaction", fontsize=12, fontweight='bold')
ax.text(0.2, 0.5, "Single Agent $i$", bbox=dict(fc="lightblue"), ha='center')
ax.text(0.8, 0.5, r"Mean State $\overline{s}$", bbox=dict(fc="white"), ha='center', fontweight='bold')
ax.annotate("", xy=(0.7, 0.5), xytext=(0.3, 0.5), arrowprops=dict(arrowstyle="<->"))
ax.text(0.5, 0.2, r"Population Limit $N \rightarrow \infty$", ha='center', fontsize=8)
def plot_goal_gan_hrl(ax):
"""HRL: Goal-GAN Pipeline."""
ax.axis('off')
ax.set_title("Goal-GAN Curriculum", fontsize=12, fontweight='bold')
ax.text(0.2, 0.7, "Goal Generator\n(GAN Ref)", bbox=dict(fc="gold"), ha='center')
ax.text(0.8, 0.7, "RL Policy\n(Worker)", bbox=dict(fc="lightblue"), ha='center')
ax.text(0.5, 0.3, "Goal Label (Success/Fail)", bbox=dict(fc="ivory"), ha='center')
ax.annotate("Set Goal $g$", xy=(0.7, 0.7), xytext=(0.3, 0.7), arrowprops=dict(arrowstyle="->"))
ax.annotate("Train GAN", xy=(0.3, 0.4), xytext=(0.5, 0.35), arrowprops=dict(arrowstyle="->"))
def plot_jepa_arch(ax):
"""Modern: JEPA (Joint Embedding Predictive Architecture)."""
ax.axis('off')
ax.set_title("JEPA: Predictive Architecture", fontsize=12, fontweight='bold')
ax.text(0.2, 0.2, "Context $x$", bbox=dict(fc="lightgrey"), ha='center')
ax.text(0.8, 0.2, "Target $y$", bbox=dict(fc="lightgrey"), ha='center')
ax.text(0.2, 0.6, "Encoder $E_x$", bbox=dict(fc="cornflowerblue"), ha='center')
ax.text(0.8, 0.6, "Encoder $E_y$", bbox=dict(fc="cornflowerblue"), ha='center')
ax.text(0.5, 0.8, "Predictor $P$", bbox=dict(fc="orange", alpha=0.3), ha='center')
for i in [0.2, 0.8]:
ax.annotate("", xy=(i, 0.5), xytext=(i, 0.3), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.4, 0.75), xytext=(0.25, 0.65), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.6, 0.75), xytext=(0.75, 0.65), arrowprops=dict(arrowstyle="->"))
def plot_cql_penalty_surface(ax):
"""Offline RL: CQL Value Penalty."""
X, Y = np.meshgrid(np.linspace(-3, 3, 20), np.linspace(-3, 3, 20))
Z = (X**2 + Y**2) - 2 * np.exp(- (X**2 + Y**2)) # CQL lower bound
ax.contourf(X, Y, Z, cmap='viridis')
ax.set_title("CQL Value Penalty Landscape", fontsize=12, fontweight='bold')
def plot_cyber_attack_defense(ax):
"""Applied: Cybersecurity RL Game."""
ax.axis('off')
ax.set_title("Cybersecurity Attack-Defense RL", fontsize=12, fontweight='bold')
ax.text(0.2, 0.7, "Attacker Agent", bbox=dict(fc="red", alpha=0.2), ha='center', fontweight='bold')
ax.text(0.8, 0.7, "Defender Agent", bbox=dict(fc="blue", alpha=0.2), ha='center', fontweight='bold')
ax.text(0.5, 0.3, "Network Infrastructure", bbox=dict(fc="grey", alpha=0.3), ha='center')
ax.annotate("Intrusion", xy=(0.4, 0.4), xytext=(0.2, 0.6), arrowprops=dict(arrowstyle="->", color='red'))
ax.annotate("Mitigation", xy=(0.6, 0.4), xytext=(0.8, 0.6), arrowprops=dict(arrowstyle="->", color='blue'))
def plot_causal_irl(ax):
"""Causal: Causal Inverse RL Graph."""
ax.axis('off')
ax.set_title("Causal Inverse RL Graph", fontsize=12, fontweight='bold')
ax.text(0.2, 0.8, "State $S$", ha='center', bbox=dict(fc="ivory"))
ax.text(0.5, 0.8, "Latent Factor $U$", ha='center', bbox=dict(fc="red", alpha=0.1), fontweight='bold')
ax.text(0.2, 0.4, "Action $A$", ha='center', bbox=dict(fc="lightblue"))
ax.text(0.5, 0.4, "Reward $R$", ha='center', bbox=dict(fc="gold"))
ax.annotate("", xy=(0.2, 0.5), xytext=(0.2, 0.7), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.4, 0.5), xytext=(0.5, 0.7), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.3, 0.4), xytext=(0.4, 0.4), arrowprops=dict(arrowstyle="->"))
def plot_vqe_rl(ax):
"""Quantum: VQE-RL Circuit Optimization."""
ax.axis('off')
ax.set_title("VQE-RL Optimization", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Quantum Circuit $U(\\theta)$", bbox=dict(fc="purple", alpha=0.1), ha='center')
ax.text(0.5, 0.5, "Energy Expectation $\\langle H \\rangle$", ha='center')
ax.text(0.5, 0.2, "RL Optimizer", bbox=dict(fc="lightblue"), ha='center')
ax.annotate("", xy=(0.5, 0.6), xytext=(0.5, 0.75), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.5, 0.3), xytext=(0.5, 0.45), arrowprops=dict(arrowstyle="->"))
def plot_drug_discovery_rl(ax):
"""Applied: RL for Drug Discovery."""
ax.axis('off')
ax.set_title("De-novo Drug Discovery RL", fontsize=12, fontweight='bold')
ax.text(0.1, 0.5, "Seed Molecule", ha='center')
ax.text(0.5, 0.5, "RL Modification Step", bbox=dict(fc="green", alpha=0.1), ha='center')
ax.text(0.9, 0.5, "Optimized Lead", ha='center')
ax.annotate("", xy=(0.4, 0.5), xytext=(0.2, 0.5), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.8, 0.5), xytext=(0.6, 0.5), arrowprops=dict(arrowstyle="->"))
def plot_traffic_signal_coordination(ax):
"""Applied: Traffic Signal RL."""
ax.axis('off')
ax.set_title("Traffic Signal Coordination RL", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Intersection Grid", ha='center')
ax.text(0.2, 0.5, "Signal A (RL)", bbox=dict(fc="red"), ha='center')
ax.text(0.8, 0.5, "Signal B (RL)", bbox=dict(fc="green"), ha='center')
ax.annotate("Max-Pressure Reward", xy=(0.7, 0.5), xytext=(0.3, 0.5), arrowprops=dict(arrowstyle="<->", color='orange'))
def plot_mars_rover_pathfinding(ax):
"""Applied: Mars Rover RL."""
ax.set_title("Mars Rover Pathfinding RL", fontsize=12, fontweight='bold')
x = np.linspace(0, 5, 20)
y = np.sin(x) + np.cos(x*0.5)
ax.plot(x, y, color='brown', lw=2, label="Terrain")
    ax.scatter([x[4], x[16]], [y[4], y[16]], color='red', label="Waypoints")  # waypoints sit on the terrain curve
ax.legend(fontsize=8)
def plot_sports_analytics_rl(ax):
"""Applied: Sports Analytics RL."""
ax.set_title("Sports Player Movement RL", fontsize=12, fontweight='bold')
x = np.random.normal(0, 1, 50)
y = np.random.normal(0, 1, 50)
ax.hexbin(x, y, gridsize=10, cmap='Blues')
ax.text(0, 0, "High Pressure Zone", ha='center', color='black', alpha=0.5)
def plot_crypto_attack_rl(ax):
"""Applied: Cryptography Attack RL."""
ax.axis('off')
ax.set_title("Cryptography Attack RL", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Cipher State", ha='center')
ax.text(0.5, 0.5, "Differential Cryptanalysis Search", bbox=dict(fc="red", alpha=0.1), ha='center')
ax.text(0.5, 0.2, "Broken Key Found", ha='center', fontweight='bold')
ax.annotate("", xy=(0.5, 0.3), xytext=(0.5, 0.7), arrowprops=dict(arrowstyle="->"))
def plot_humanitarian_rl(ax):
"""Applied: Humanitarian Aid RL."""
ax.axis('off')
ax.set_title("Humanitarian Resource RL", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Disaster Zone Clusters", ha='center')
ax.text(0.2, 0.4, "Supply Hub", bbox=dict(fc="lightgrey"), ha='center')
ax.text(0.8, 0.4, "Isolated Community", bbox=dict(fc="red", alpha=0.1), ha='center')
ax.annotate("Optimal Cargo Drop", xy=(0.7, 0.4), xytext=(0.3, 0.4), arrowprops=dict(arrowstyle="->", lw=2, color='blue'))
def plot_video_compression_rl(ax):
"""Applied: Video Compression RL."""
ax.set_title("Video Compression RL (Rate-Distortion)", fontsize=12, fontweight='bold')
bitrate = np.logspace(0, 10, 100)
distortion = 1 / bitrate
ax.loglog(bitrate, distortion, label=r"Policy $\pi$ RD curve")
ax.set_xlabel("Bit Rate"); ax.set_ylabel("Distortion")
ax.legend()
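# A common reward shaping for compression agents, assuming the standard
# rate-distortion Lagrangian J = D + lambda * R from codec optimization;
# the agent maximizes -J. A minimal sketch with illustrative names.
def rate_distortion_reward(distortion, bitrate, lam=0.1):
    return -(distortion + lam * bitrate)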
def plot_kubernetes_scaling_rl(ax):
"""Applied: Kubernetes Scaling RL."""
ax.axis('off')
ax.set_title("Kubernetes Auto-scaling RL", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Cloud Service Load", ha='center')
ax.text(0.5, 0.5, "RL Autoscaler", bbox=dict(fc="gold"), ha='center')
ax.text(0.5, 0.2, "Replicas count $n$", ha='center')
ax.annotate("", xy=(0.5, 0.3), xytext=(0.5, 0.7), arrowprops=dict(arrowstyle="->"))
def plot_fluid_dynamics_rl(ax):
"""Applied: Fluid Dynamics Control RL."""
ax.set_title("Fluid Dynamics Flow Control RL", fontsize=12, fontweight='bold')
Y, X = np.mgrid[-1:1:100j, -1:1:100j]
U = -1 - X**2 + Y
V = 1 + X - Y**2
ax.streamplot(X, Y, U, V, color='cornflowerblue')
ax.set_title("Flow Control optimization")
def plot_structural_optimization_rl(ax):
"""Applied: Structural RL."""
ax.axis('off')
ax.set_title("Structural Optimization RL", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Bridge Topology State", ha='center')
ax.text(0.5, 0.5, "Agent Stress Estimation", ha='center')
ax.text(0.5, 0.2, "Material Placement Action", ha='center', bbox=dict(fc="lightgrey"))
def plot_human_decision_rl(ax):
"""Applied: Human Modeling RL."""
ax.set_title("Human Decision Modeling (Prospect Theory)", fontsize=12, fontweight='bold')
x = np.linspace(-10, 10, 100)
    # Kahneman-Tversky value function; use |x| so the unselected np.where branch never raises a negative base to a fractional power
    y = np.where(x >= 0, np.abs(x)**0.88, -2.25 * np.abs(x)**0.88)
ax.plot(x, y, label="Human Value Function")
ax.axvline(0, color='black', alpha=0.2)
ax.legend()
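# The curve above is the Kahneman-Tversky value function with the commonly
# cited parameters alpha = 0.88 (diminishing sensitivity) and lambda = 2.25
# (loss aversion). A reusable sketch of the same formula:
def prospect_value(x, alpha=0.88, lam=2.25):
    x = np.asarray(x, dtype=float)
    return np.where(x >= 0, np.abs(x)**alpha, -lam * np.abs(x)**alpha)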
def plot_semantic_parsing_rl(ax):
"""Applied: Semantic Parsing RL."""
ax.axis('off')
ax.set_title("Semantic Parsing RL", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Natural Language Input", bbox=dict(fc="ivory"), ha='center')
ax.text(0.5, 0.5, "Logic derivation step $a$", ha='center')
ax.text(0.5, 0.2, "SQL/Lambda Expr Tree", ha='center', fontweight='bold')
def plot_music_melody_rl(ax):
"""Applied: Music Composition RL."""
ax.axis('off')
ax.set_title("Melody generation RL", fontsize=12, fontweight='bold')
ax.text(0.5, 0.5, "Sequence of Notes\n(C, E, G, B...)", bbox=dict(fc="ivory"), ha='center')
ax.text(0.5, 0.2, "Aesthetic Reward Model", bbox=dict(fc="pink"), ha='center')
def plot_plasma_control_rl(ax):
"""Applied: Plasma Fusion RL."""
ax.axis('off')
ax.set_title("Plasma Fusion Control RL", fontsize=12, fontweight='bold')
ax.add_patch(plt.Circle((0.5, 0.5), 0.3, color='orange', alpha=0.5, label="Plasma"))
ax.text(0.5, 0.5, "Tokamak Center", ha='center')
ax.annotate("Magnetic Coil Action", xy=(0.8, 0.8), xytext=(0.6, 0.6), arrowprops=dict(arrowstyle="->", color='red'))
def plot_carbon_capture_rl(ax):
"""Applied: Carbon Capture RL."""
ax.axis('off')
ax.set_title("Carbon Capture RL cycle", fontsize=12, fontweight='bold')
ax.text(0.2, 0.5, "Adsorption", bbox=dict(fc="lightgreen"), ha='center')
ax.text(0.8, 0.5, "Desorption", bbox=dict(fc="orange"), ha='center')
ax.annotate("", xy=(0.7, 0.5), xytext=(0.3, 0.5), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.3, 0.45), xytext=(0.7, 0.45), arrowprops=dict(arrowstyle="->"))
def plot_swarm_robotics_rl(ax):
"""Applied: Swarm RL."""
ax.axis('off')
ax.set_title("Swarm Robotics RL", fontsize=12, fontweight='bold')
for _ in range(10):
pos = np.random.rand(2)
ax.add_patch(plt.Circle(pos, 0.02, color='black'))
ax.text(0.5, 1, "Emergent Coordination Plan", ha='center')
def plot_legal_compliance_rl(ax):
"""Applied: Legal RL."""
ax.axis('off')
ax.set_title("Legal Compliance RL Game", fontsize=12, fontweight='bold')
ax.text(0.2, 0.5, r"Regulation $\mathcal{L}$", ha='center')
ax.text(0.8, 0.5, r"Compliance Policy $\pi$", ha='center', bbox=dict(fc="gold"))
ax.annotate("Audit", xy=(0.2, 0.4), xytext=(0.8, 0.4), arrowprops=dict(arrowstyle="->", ls='--'))
def plot_pinn_rl_loss(ax):
"""Physics: Physics-Informed RL (PINN)."""
ax.axis('off')
ax.set_title("Physics-Informed RL (PINN)", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, r"RL Loss $\mathcal{L}_{RL}$", ha='center')
ax.text(0.5, 0.5, r"PDE Constraint $\mathcal{L}_{Phys}$", ha='center', color='red')
ax.text(0.5, 0.2, r"Optimized Policy $\pi_\theta$", fontweight='bold', ha='center')
def plot_neuro_symbolic_rl(ax):
"""Modern: Neuro-Symbolic RL."""
ax.axis('off')
ax.set_title("Neuro-Symbolic RL", fontsize=12, fontweight='bold')
ax.text(0.2, 0.5, "Neural State", bbox=dict(fc="lightblue"), ha='center')
ax.text(0.8, 0.5, "Symbolic Logic", bbox=dict(fc="lightgreen"), ha='center')
ax.annotate("Abstraction", xy=(0.7, 0.5), xytext=(0.3, 0.5), arrowprops=dict(arrowstyle="->"))
def plot_defi_liquidity_rl(ax):
"""Applied: DeFi RL."""
ax.axis('off')
ax.set_title("DeFi Liquidity Pool RL", fontsize=12, fontweight='bold')
ax.text(0.5, 0.7, "Liquidity Pool $(x, y)$", bbox=dict(fc="gold"), ha='center')
ax.text(0.5, 0.3, "LP Strategy Policy", ha='center')
ax.annotate("Arbitrage Action", xy=(1.0, 0.5), xytext=(0.6, 0.5), arrowprops=dict(arrowstyle="->"))
def plot_dopamine_rpe_curves(ax):
"""Neuroscience: Dopamine RPE."""
ax.set_title("Dopamine Reward Prediction Error", fontsize=12, fontweight='bold')
t = np.linspace(0, 10, 100)
rpe = np.exp(-t)
ax.plot(t, rpe, label=r"Expected RPE $\delta$")
ax.set_ylabel("Dopamine neurons firing rate")
ax.legend()
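# The classical account (Schultz et al.) maps phasic dopamine activity to the
# TD error delta = r + gamma * V(s') - V(s). A one-line sketch of that
# quantity; the function name is illustrative.
def reward_prediction_error(r, v_next, v_curr, gamma=0.99):
    return r + gamma * v_next - v_curr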
def plot_proprioceptive_rl_loop(ax):
"""Robotics: Proprioceptive RL."""
ax.axis('off')
ax.set_title("Proprioceptive Sensory-Motor RL", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Joint Encoders", ha='center')
ax.text(0.5, 0.4, "Low-level Controller", bbox=dict(fc="lightgrey"), ha='center')
ax.annotate("", xy=(0.5, 0.5), xytext=(0.5, 0.7), arrowprops=dict(arrowstyle="->"))
def plot_ar_placement_rl(ax):
"""Applied: AR RL."""
ax.axis('off')
ax.set_title("Augmented Reality Object Placement RL", fontsize=12, fontweight='bold')
ax.text(0.5, 0.5, "[AR View of Room]", ha='center')
ax.text(0.8, 0.8, "Optimal overlay position", ha='center', color='blue')
def plot_sequential_bundle_rl(ax):
"""Recommendation: Sequential Bundle RL."""
ax.axis('off')
ax.set_title("Sequential Bundle RL", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "User context sequence", ha='center')
nodes = ["Item 1", "Item 2", "Item 3"]
for i, n in enumerate(nodes):
ax.text(0.2 + i*0.3, 0.5, n, bbox=dict(fc="ivory"), ha='center')
def plot_ogd_vs_rl_gradient(ax):
"""Theoretical: OGD vs RL."""
ax.set_title("Online Gradient Descent vs RL", fontsize=12, fontweight='bold')
x = np.linspace(-3, 3, 100)
ax.plot(x, x**2, label="OGD loss curve")
    ax.plot(x, np.log1p(np.exp(-x)), label="RL surrogate loss", ls='--')  # -log(sigmoid(x)), written in a numerically stable form
ax.legend()
def plot_active_learning_selection(ax):
"""Modern: Active Learning RL."""
ax.axis('off')
ax.set_title("Active Learning: Query RL Selection", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Pool of unlabeled samples", ha='center')
ax.text(0.5, 0.5, r"Acquisition Policy $\pi$", bbox=dict(fc="gold"), ha='center')
ax.annotate("Send to Oracle", xy=(1.0, 0.5), xytext=(0.7, 0.5), arrowprops=dict(arrowstyle="->"))
def plot_federated_rl_tree(ax):
"""Modern: Federated RL."""
ax.axis('off')
ax.set_title("Federated RL global Aggregator", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Global Model / Server", bbox=dict(fc="purple", alpha=0.1), ha='center')
for i in [0.2, 0.5, 0.8]:
ax.text(i, 0.4, f"Local Agent {int(i*3)}", bbox=dict(fc="lightgrey"), ha='center')
ax.annotate("", xy=(0.5, 0.75), xytext=(i, 0.45), arrowprops=dict(arrowstyle="<->"))
def plot_ultimate_mastery_diagram(ax):
"""Conceptual: Ultimate Universal RL Mastery Diagram."""
ax.axis('off')
ax.set_title("UTIMATE UNIVERSAL RL MASTERY MILESTONE (230)", fontsize=16, fontweight='bold', color='darkred')
ax.text(0.5, 0.5, "The Definitive\nMaster Anthology\nof Reinforcement Learning", ha='center', fontsize=12, fontweight='bold')
ax.add_patch(plt.Circle((0.5, 0.5), 0.4, color='gold', alpha=0.05, lw=5, edgecolor='black'))
ax.text(0.5, 0.1, "230 UNIQUE GRAPHICAL REPRESENTATIONS ACHIEVED", ha='center', fontweight='bold')
def plot_smart_grid_rl(ax):
"""Applied: Smart Grid Supply/Demand."""
ax.axis('off')
ax.set_title("Smart Grid RL Management", fontsize=12, fontweight='bold')
ax.text(0.2, 0.8, "Renewables", ha='center')
ax.text(0.8, 0.8, "Consumers", ha='center')
ax.text(0.5, 0.5, "RL Dispatcher", bbox=dict(fc="gold"), ha='center')
ax.text(0.5, 0.2, "Energy Storage", bbox=dict(fc="lightgrey"), ha='center')
ax.annotate("", xy=(0.4, 0.55), xytext=(0.25, 0.75), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.75, 0.75), xytext=(0.6, 0.6), arrowprops=dict(arrowstyle="<-"))
ax.annotate("", xy=(0.5, 0.3), xytext=(0.5, 0.45), arrowprops=dict(arrowstyle="->"))
def plot_quantum_tomography_rl(ax):
"""Applied: Quantum State Tomography."""
ax.axis('off')
ax.set_title("Quantum state Tomography RL", fontsize=12, fontweight='bold')
ax.text(0.5, 0.8, "Quantum State $\\rho$", bbox=dict(boxstyle="circle", fc="purple", alpha=0.2), ha='center')
ax.text(0.5, 0.5, "Measurement $M$", ha='center')
ax.text(0.5, 0.2, "RL Estimator", bbox=dict(fc="lightblue"), ha='center')
ax.annotate("", xy=(0.5, 0.6), xytext=(0.5, 0.75), arrowprops=dict(arrowstyle="->"))
ax.annotate("", xy=(0.5, 0.3), xytext=(0.5, 0.45), arrowprops=dict(arrowstyle="->"))
def plot_absolute_encyclopedia_map(ax):
"""Conceptual: Absolute Universal Encyclopedia Map."""
ax.axis('off')
ax.set_title("Absolute Universal RL Pillar Map", fontsize=14, fontweight='bold', color='darkblue')
categories = ["Foundational", "Model-Free", "Model-Based", "Advanced Paradigms", "Analysis/Safety", "Applied Pipelines"]
for i, c in enumerate(categories):
angle = 2 * np.pi * i / 6
ax.text(0.5 + 0.35*np.cos(angle), 0.5 + 0.35*np.sin(angle), c, bbox=dict(fc="ivory", lw=2), ha='center', fontsize=9)
ax.text(0.5, 0.5, "Reinforcement\nLearning\nGraphical\nLibrary", ha='center', fontweight='bold', fontsize=12)
for i in range(6):
angle = 2 * np.pi * i / 6
ax.annotate("", xy=(0.5 + 0.25*np.cos(angle), 0.5 + 0.25*np.sin(angle)), xytext=(0.5, 0.5), arrowprops=dict(arrowstyle="->", alpha=0.3))
def plot_actor_critic_arch(ax):
"""Actor-Critic: Three-network diagram (TD3 - actor + two critics)."""
ax.axis('off')
ax.set_title("TD3 Architecture Diagram", fontsize=12, fontweight='bold')
# State input
ax.text(0.1, 0.5, r"State" + "\n" + r"$s$", ha="center", va="center", bbox=dict(boxstyle="circle,pad=0.5", fc="lightblue"))
# Networks
net_props = dict(boxstyle="square,pad=0.8", fc="lightgreen", ec="black")
ax.text(0.5, 0.8, r"Actor $\pi_\phi$", ha="center", va="center", bbox=net_props)
ax.text(0.5, 0.5, r"Critic 1 $Q_{\theta_1}$", ha="center", va="center", bbox=net_props)
ax.text(0.5, 0.2, r"Critic 2 $Q_{\theta_2}$", ha="center", va="center", bbox=net_props)
# Outputs
ax.text(0.8, 0.8, "Action $a$", ha="center", va="center", bbox=dict(boxstyle="circle,pad=0.3", fc="coral"))
ax.text(0.8, 0.35, "Min Q-value", ha="center", va="center", bbox=dict(boxstyle="round,pad=0.3", fc="gold"))
# Connections
kwargs = dict(arrowstyle="->", lw=1.5)
ax.annotate("", xy=(0.38, 0.8), xytext=(0.15, 0.55), arrowprops=kwargs) # S -> Actor
ax.annotate("", xy=(0.38, 0.5), xytext=(0.15, 0.5), arrowprops=kwargs) # S -> C1
ax.annotate("", xy=(0.38, 0.2), xytext=(0.15, 0.45), arrowprops=kwargs) # S -> C2
ax.annotate("", xy=(0.73, 0.8), xytext=(0.62, 0.8), arrowprops=kwargs) # Actor -> Action
ax.annotate("", xy=(0.68, 0.35), xytext=(0.62, 0.5), arrowprops=kwargs) # C1 -> Min
ax.annotate("", xy=(0.68, 0.35), xytext=(0.62, 0.2), arrowprops=kwargs) # C2 -> Min
def plot_epsilon_decay(ax):
"""Exploration: ε-Greedy Strategy Decay Curve."""
episodes = np.arange(0, 1000)
epsilon = np.maximum(0.01, np.exp(-0.005 * episodes)) # Exponential decay
ax.plot(episodes, epsilon, color='purple', lw=2)
ax.set_title(r"$\epsilon$-Greedy Decay Curve", fontsize=12, fontweight='bold')
ax.set_xlabel("Episodes")
ax.set_ylabel(r"Probability $\epsilon$")
ax.grid(True, linestyle='--', alpha=0.6)
ax.fill_between(episodes, epsilon, color='purple', alpha=0.1)
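# A minimal epsilon-greedy action selector using the same exponential schedule
# as the curve above (floor 0.01, decay rate 0.005); `q_values` is a 1-D array
# of action values and `rng` a numpy Generator. Illustrative, not tied to any
# specific agent class.
def epsilon_greedy_action(q_values, episode, rng, eps_min=0.01, decay=0.005):
    eps = max(eps_min, np.exp(-decay * episode))
    if rng.random() < eps:
        return int(rng.integers(len(q_values)))  # explore: uniform random action
    return int(np.argmax(q_values))              # exploit: greedy action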
def plot_learning_curve(ax):
"""Advanced / Misc: Learning Curve with Confidence Bands."""
steps = np.linspace(0, 1e6, 100)
# Simulate a learning curve converging to a maximum
mean_return = 100 * (1 - np.exp(-5e-6 * steps)) + np.random.normal(0, 2, len(steps))
std_dev = 15 * np.exp(-2e-6 * steps) # Variance decreases as policy stabilizes
ax.plot(steps, mean_return, color='blue', lw=2, label="PPO (Mean)")
ax.fill_between(steps, mean_return - std_dev, mean_return + std_dev, color='blue', alpha=0.2, label="±1 Std Dev")
ax.set_title("Learning Curve (Return vs Steps)", fontsize=12, fontweight='bold')
ax.set_xlabel("Environment Steps")
ax.set_ylabel("Average Episodic Return")
ax.legend(loc="lower right")
ax.grid(True, linestyle='--', alpha=0.6)
def main():
# Figure 1: MDP & Environment (7 plots)
fig1, gs1 = setup_figure("RL: MDP & Environment", 2, 4)
plot_agent_env_loop(fig1.add_subplot(gs1[0, 0]))
plot_mdp_graph(fig1.add_subplot(gs1[0, 1]))
plot_trajectory(fig1.add_subplot(gs1[0, 2]))
plot_continuous_space(fig1.add_subplot(gs1[0, 3]))
plot_reward_landscape(fig1, gs1) # projection='3d' handled inside
plot_discount_decay(fig1.add_subplot(gs1[1, 1]))
    # The "State Transition Graph" entry reuses plot_mdp_graph, so it needs no extra subplot here
# Layout handled by constrained_layout=True
# Figure 2: Value, Policy & Dynamic Programming
fig2, gs2 = setup_figure("RL: Value, Policy & Dynamic Programming", 2, 4)
plot_value_heatmap(fig2.add_subplot(gs2[0, 0]))
plot_action_value_q(fig2.add_subplot(gs2[0, 1]))
plot_policy_arrows(fig2.add_subplot(gs2[0, 2]))
plot_advantage_function(fig2.add_subplot(gs2[0, 3]))
plot_backup_diagram(fig2.add_subplot(gs2[1, 0])) # Policy Eval
plot_policy_improvement(fig2.add_subplot(gs2[1, 1]))
plot_value_iteration_backup(fig2.add_subplot(gs2[1, 2]))
plot_policy_iteration_cycle(fig2.add_subplot(gs2[1, 3]))
# Layout handled by constrained_layout=True
# Figure 3: Monte Carlo & Temporal Difference
fig3, gs3 = setup_figure("RL: Monte Carlo & Temporal Difference", 2, 4)
plot_mc_backup(fig3.add_subplot(gs3[0, 0]))
plot_mcts(fig3.add_subplot(gs3[0, 1]))
plot_importance_sampling(fig3.add_subplot(gs3[0, 2]))
plot_td_backup(fig3.add_subplot(gs3[0, 3]))
plot_nstep_td(fig3.add_subplot(gs3[1, 0]))
plot_eligibility_traces(fig3.add_subplot(gs3[1, 1]))
plot_sarsa_backup(fig3.add_subplot(gs3[1, 2]))
plot_q_learning_backup(fig3.add_subplot(gs3[1, 3]))
# Layout handled by constrained_layout=True
# Figure 4: TD Extensions & Function Approximation
fig4, gs4 = setup_figure("RL: TD Extensions & Function Approximation", 2, 4)
plot_double_q(fig4.add_subplot(gs4[0, 0]))
plot_dueling_dqn(fig4.add_subplot(gs4[0, 1]))
plot_prioritized_replay(fig4.add_subplot(gs4[0, 2]))
plot_rainbow_dqn(fig4.add_subplot(gs4[0, 3]))
plot_linear_fa(fig4.add_subplot(gs4[1, 0]))
plot_nn_layers(fig4.add_subplot(gs4[1, 1]))
plot_computation_graph(fig4.add_subplot(gs4[1, 2]))
plot_target_network(fig4.add_subplot(gs4[1, 3]))
# Layout handled by constrained_layout=True
# Figure 5: Policy Gradients, Actor-Critic & Exploration
fig5, gs5 = setup_figure("RL: Policy Gradients, Actor-Critic & Exploration", 2, 4)
plot_policy_gradient_flow(fig5.add_subplot(gs5[0, 0]))
plot_ppo_clip(fig5.add_subplot(gs5[0, 1]))
plot_trpo_trust_region(fig5.add_subplot(gs5[0, 2]))
plot_actor_critic_arch(fig5.add_subplot(gs5[0, 3]))
plot_a3c_multi_worker(fig5.add_subplot(gs5[1, 0]))
plot_sac_arch(fig5.add_subplot(gs5[1, 1]))
plot_softmax_exploration(fig5.add_subplot(gs5[1, 2]))
plot_ucb_confidence(fig5.add_subplot(gs5[1, 3]))
# Layout handled by constrained_layout=True
# Figure 6: Hierarchical, Model-Based & Offline RL
fig6, gs6 = setup_figure("RL: Hierarchical, Model-Based & Offline", 2, 4)
plot_options_framework(fig6.add_subplot(gs6[0, 0]))
plot_feudal_networks(fig6.add_subplot(gs6[0, 1]))
plot_world_model(fig6.add_subplot(gs6[0, 2]))
plot_model_planning(fig6.add_subplot(gs6[0, 3]))
plot_offline_rl(fig6.add_subplot(gs6[1, 0]))
plot_cql_regularization(fig6.add_subplot(gs6[1, 1]))
plot_epsilon_decay(fig6.add_subplot(gs6[1, 2])) # placeholder/spacer
plot_intrinsic_motivation(fig6.add_subplot(gs6[1, 3]))
# Layout handled by constrained_layout=True
# Figure 7: Multi-Agent, IRL & Meta-RL
fig7, gs7 = setup_figure("RL: Multi-Agent, IRL & Meta-RL", 2, 4)
plot_multi_agent_interaction(fig7.add_subplot(gs7[0, 0]))
plot_ctde(fig7.add_subplot(gs7[0, 1]))
plot_payoff_matrix(fig7.add_subplot(gs7[0, 2]))
plot_irl_reward_inference(fig7.add_subplot(gs7[0, 3]))
plot_gail_flow(fig7.add_subplot(gs7[1, 0]))
plot_meta_rl_nested_loop(fig7.add_subplot(gs7[1, 1]))
plot_task_distribution(fig7.add_subplot(gs7[1, 2]))
# Layout handled by constrained_layout=True
# Figure 8: Advanced / Miscellaneous Topics
fig8, gs8 = setup_figure("RL: Advanced & Miscellaneous", 2, 4)
plot_replay_buffer(fig8.add_subplot(gs8[0, 0]))
plot_state_visitation(fig8.add_subplot(gs8[0, 1]))
plot_regret_curve(fig8.add_subplot(gs8[0, 2]))
plot_attention_weights(fig8.add_subplot(gs8[0, 3]))
plot_diffusion_policy(fig8.add_subplot(gs8[1, 0]))
plot_gnn_rl(fig8.add_subplot(gs8[1, 1]))
plot_latent_space(fig8.add_subplot(gs8[1, 2]))
plot_convergence_log(fig8.add_subplot(gs8[1, 3]))
# Figure 9: Specialized & Modern RL (Advanced Gallery)
fig9, gs9 = setup_figure("RL: Specialized & Modern (Absolute Completeness)", 3, 4)
# Row 1
plot_rl_taxonomy_tree(fig9.add_subplot(gs9[0, 0]))
plot_rl_as_inference_pgm(fig9.add_subplot(gs9[0, 1]))
plot_distributional_rl_atoms(fig9.add_subplot(gs9[0, 2]))
plot_her_goal_relabeling(fig9.add_subplot(gs9[0, 3]))
# Row 2
plot_dyna_q_flow(fig9.add_subplot(gs9[1, 0]))
plot_noisy_nets_parameters(fig9.add_subplot(gs9[1, 1]))
plot_icm_curiosity(fig9.add_subplot(gs9[1, 2]))
plot_v_trace_impala(fig9.add_subplot(gs9[1, 3]))
# Row 3
plot_qmix_mixing_net(fig9.add_subplot(gs9[2, 0]))
plot_saliency_heatmaps(fig9.add_subplot(gs9[2, 1]))
plot_tsne_state_embeddings(fig9.add_subplot(gs9[2, 2]))
plot_action_selection_noise(fig9.add_subplot(gs9[2, 3]))
# Figure 10: Evaluation, Safety & Alignment
fig10, gs10 = setup_figure("RL: Evaluation, Safety & Alignment", 2, 4)
plot_success_rate_curve(fig10.add_subplot(gs10[0, 0]))
plot_performance_profiles_rliable(fig10.add_subplot(gs10[0, 1]))
plot_hyperparameter_sensitivity(fig10.add_subplot(gs10[0, 2]))
plot_action_persistence(fig10.add_subplot(gs10[0, 3]))
plot_safety_shielding(fig10.add_subplot(gs10[1, 0]))
plot_automated_curriculum(fig10.add_subplot(gs10[1, 1]))
plot_domain_randomization(fig10.add_subplot(gs10[1, 2]))
plot_rlhf_flow(fig10.add_subplot(gs10[1, 3]))
# Figure 11: Transformer & Specific MB Architecture
fig11, gs11 = setup_figure("RL: Transformers & Specific MB Architecture", 1, 3)
plot_decision_transformer_tokens(fig11.add_subplot(gs11[0, 0]))
plot_muzero_search_tree(fig11.add_subplot(gs11[0, 1]))
plot_policy_distillation(fig11.add_subplot(gs11[0, 2]))
    # The loss landscape requires a 3D projection, so it gets a dedicated figure
    # instead of a slot in one of the 2D dashboard grids.
fig_loss = plt.figure(figsize=(10, 8))
gs_loss = GridSpec(1, 1, figure=fig_loss)
plot_loss_landscape(fig_loss, gs_loss)
plt.show()
def save_all_graphs(output_dir="graphs"):
"""Saves each of the 74 RL components as a separate PNG file."""
if not os.path.exists(output_dir):
os.makedirs(output_dir)
    # Component-to-function mapping (one entry per graphic listed in e.md; some concepts share a plot function)
mapping = {
"Agent-Environment Interaction Loop": plot_agent_env_loop,
"Markov Decision Process (MDP) Tuple": plot_mdp_graph,
"State Transition Graph": plot_mdp_graph,
"Trajectory / Episode Sequence": plot_trajectory,
"Continuous State/Action Space Visualization": plot_continuous_space,
"Reward Function / Landscape": plot_reward_landscape,
"Discount Factor (gamma) Effect": plot_discount_decay,
"State-Value Function V(s)": plot_value_heatmap,
"Action-Value Function Q(s,a)": plot_action_value_q,
"Policy pi(s) or pi(a|s)": plot_policy_arrows,
"Advantage Function A(s,a)": plot_advantage_function,
"Optimal Value Function V* / Q*": plot_value_heatmap,
"Policy Evaluation Backup": plot_backup_diagram,
"Policy Improvement": plot_policy_improvement,
"Value Iteration Backup": plot_value_iteration_backup,
"Policy Iteration Full Cycle": plot_policy_iteration_cycle,
"Monte Carlo Backup": plot_mc_backup,
"Monte Carlo Tree (MCTS)": plot_mcts,
"Importance Sampling Ratio": plot_importance_sampling,
"TD(0) Backup": plot_td_backup,
"Bootstrapping (general)": plot_td_backup,
"n-step TD Backup": plot_nstep_td,
"TD(lambda) & Eligibility Traces": plot_eligibility_traces,
"SARSA Update": plot_sarsa_backup,
"Q-Learning Update": plot_q_learning_backup,
"Expected SARSA": plot_expected_sarsa_backup,
"Double Q-Learning / Double DQN": plot_double_q,
"Dueling DQN Architecture": plot_dueling_dqn,
"Prioritized Experience Replay": plot_prioritized_replay,
"Rainbow DQN Components": plot_rainbow_dqn,
"Linear Function Approximation": plot_linear_fa,
"Neural Network Layers (MLP, CNN, RNN, Transformer)": plot_nn_layers,
"Computation Graph / Backpropagation Flow": plot_computation_graph,
"Target Network": plot_target_network,
"Policy Gradient Theorem": plot_policy_gradient_flow,
"REINFORCE Update": plot_reinforce_flow,
"Baseline / Advantage Subtraction": plot_advantage_scaled_grad,
"Trust Region (TRPO)": plot_trpo_trust_region,
"Proximal Policy Optimization (PPO)": plot_ppo_clip,
"Actor-Critic Architecture": plot_actor_critic_arch,
"Advantage Actor-Critic (A2C/A3C)": plot_a3c_multi_worker,
"Soft Actor-Critic (SAC)": plot_sac_arch,
"Twin Delayed DDPG (TD3)": plot_actor_critic_arch,
"epsilon-Greedy Strategy": plot_epsilon_decay,
"Softmax / Boltzmann Exploration": plot_softmax_exploration,
"Upper Confidence Bound (UCB)": plot_ucb_confidence,
"Intrinsic Motivation / Curiosity": plot_intrinsic_motivation,
"Entropy Regularization": plot_entropy_bonus,
"Options Framework": plot_options_framework,
"Feudal Networks / Hierarchical Actor-Critic": plot_feudal_networks,
"Skill Discovery": plot_skill_discovery,
"Learned Dynamics Model": plot_world_model,
"Model-Based Planning": plot_model_planning,
"Imagination-Augmented Agents (I2A)": plot_imagination_rollout,
"Offline Dataset": plot_offline_rl,
"Conservative Q-Learning (CQL)": plot_cql_regularization,
"Multi-Agent Interaction Graph": plot_multi_agent_interaction,
"Centralized Training Decentralized Execution (CTDE)": plot_ctde,
"Cooperative / Competitive Payoff Matrix": plot_payoff_matrix,
"Reward Inference": plot_irl_reward_inference,
"Generative Adversarial Imitation Learning (GAIL)": plot_gail_flow,
"Meta-RL Architecture": plot_meta_rl_nested_loop,
"Task Distribution Visualization": plot_task_distribution,
"Experience Replay Buffer": plot_replay_buffer,
"State Visitation / Occupancy Measure": plot_state_visitation,
"Learning Curve": plot_learning_curve,
"Regret / Cumulative Regret": plot_regret_curve,
"Attention Mechanisms (Transformers in RL)": plot_attention_weights,
"Diffusion Policy": plot_diffusion_policy,
"Graph Neural Networks for RL": plot_gnn_rl,
"World Model / Latent Space": plot_latent_space,
"Convergence Analysis Plots": plot_convergence_log,
"RL Algorithm Taxonomy": plot_rl_taxonomy_tree,
"Probabilistic Graphical Model (RL as Inference)": plot_rl_as_inference_pgm,
"Distributional RL (C51 / Categorical)": plot_distributional_rl_atoms,
"Hindsight Experience Replay (HER)": plot_her_goal_relabeling,
"Dyna-Q Architecture": plot_dyna_q_flow,
"Noisy Networks (Parameter Noise)": plot_noisy_nets_parameters,
"Intrinsic Curiosity Module (ICM)": plot_icm_curiosity,
"V-trace (IMPALA)": plot_v_trace_impala,
"QMIX Mixing Network": plot_qmix_mixing_net,
"Saliency Maps / Attention on State": plot_saliency_heatmaps,
"Action Selection Noise (OU vs Gaussian)": plot_action_selection_noise,
"t-SNE / UMAP State Embeddings": plot_tsne_state_embeddings,
"Loss Landscape Visualization": plot_loss_landscape,
"Success Rate vs Steps": plot_success_rate_curve,
"Hyperparameter Sensitivity Heatmap": plot_hyperparameter_sensitivity,
"Action Persistence (Frame Skipping)": plot_action_persistence,
"MuZero Dynamics Search Tree": plot_muzero_search_tree,
"Policy Distillation": plot_policy_distillation,
"Decision Transformer Token Sequence": plot_decision_transformer_tokens,
"Performance Profiles (rliable)": plot_performance_profiles_rliable,
"Safety Shielding / Barrier Functions": plot_safety_shielding,
"Automated Curriculum Learning": plot_automated_curriculum,
"Domain Randomization": plot_domain_randomization,
"RL with Human Feedback (RLHF)": plot_rlhf_flow,
"Successor Representations (SR)": plot_successor_representations,
"Maximum Entropy IRL": plot_maxent_irl_trajectories,
"Information Bottleneck": plot_information_bottleneck,
"Evolutionary Strategies Population": plot_es_population_distribution,
"Control Barrier Functions (CBF)": plot_cbf_safe_set,
"Count-based Exploration Heatmap": plot_count_based_exploration,
"Thompson Sampling Posteriors": plot_thompson_sampling,
"Adversarial RL Interaction": plot_adversarial_rl_interaction,
"Hierarchical Subgoal Trajectory": plot_hierarchical_subgoals,
"Offline Action Distribution Shift": plot_offline_distribution_shift,
"Random Network Distillation (RND)": plot_rnd_curiosity,
"Batch-Constrained Q-learning (BCQ)": plot_bcq_offline_constraint,
"Population-Based Training (PBT)": plot_pbt_evolution,
"Recurrent State Flow (DRQN/R2D2)": plot_recurrent_state_flow,
"Belief State in POMDPs": plot_belief_state_pomdp,
"Multi-Objective Pareto Front": plot_pareto_front_morl,
"Differential Value (Average Reward RL)": plot_differential_value_average_reward,
"Distributed RL Cluster (Ray/RLLib)": plot_distributed_rl_cluster,
"Neuroevolution Topology Evolution": plot_neuroevolution_topology,
"Elastic Weight Consolidation (EWC)": plot_ewc_elastic_weights,
"Successor Features (SF)": plot_successor_features,
"Adversarial State Noise (Perception)": plot_adversarial_state_noise,
"Behavioral Cloning (Imitation)": plot_behavioral_cloning_il,
"Relational Graph State Representation": plot_relational_graph_state,
"Quantum RL Circuit (PQC)": plot_quantum_rl_circuit,
"Symbolic Policy Tree": plot_symbolic_expression_tree,
"Differentiable Physics Gradient Flow": plot_differentiable_physics_gradient,
"MARL Communication Channel": plot_marl_communication_channel,
"Lagrangian Constraint Landscape": plot_lagrangian_multiplier_landscape,
"MAXQ Task Hierarchy": plot_maxq_task_hierarchy,
"ReAct Agentic Cycle": plot_react_cycle_thinking,
"Synaptic Plasticity RL": plot_synaptic_plasticity_rl,
"Guided Policy Search (GPS)": plot_guided_policy_search_gps,
"Sim-to-Real Jitter & Latency": plot_sim2real_jitter_latency,
"Deterministic Policy Gradient (DDPG) Flow": plot_ddpg_deterministic_gradient,
"Dreamer Latent Imagination": plot_dreamer_latent_rollout,
"UNREAL Auxiliary Tasks": plot_unreal_auxiliary_tasks,
"Implicit Q-Learning (IQL) Expectile": plot_iql_expectile_loss,
"Prioritized Sweeping": plot_prioritized_sweeping,
"DAgger Expert Loop": plot_dagger_expert_loop,
"Self-Predictive Representations (SPR)": plot_spr_self_prediction,
"Joint Action Space": plot_joint_action_space,
"Dec-POMDP Formal Model": plot_dec_pomdp_graph,
"Bisimulation Metric": plot_bisimulation_metric,
"Potential-Based Reward Shaping": plot_reward_shaping_phi,
"Transfer RL: Source to Target": plot_transfer_rl_source_target,
"Multi-Task Backbone Arch": plot_multi_task_backbone,
"Contextual Bandit Pipeline": plot_contextual_bandit_pipeline,
"Theoretical Regret Bounds": plot_regret_bounds_theoretical,
"Soft Q Boltzmann Probabilities": plot_soft_q_heatmap,
"Autonomous Driving RL Pipeline": plot_ad_rl_pipeline,
"Policy action gradient comparison": plot_action_grad_comparison,
"IRL: Feature Expectation Matching": plot_irl_feature_matching,
"Apprenticeship Learning Loop": plot_apprenticeship_learning_loop,
"Active Inference Loop": plot_active_inference_loop,
"Bellman Residual Landscape": plot_bellman_residual_landscape,
"Plan-to-Explore Uncertainty Map": plot_plan_to_explore_map,
"Robust RL Uncertainty Set": plot_robust_rl_uncertainty_set,
"HPO Bayesian Opt Cycle": plot_hpo_bayesian_opt_cycle,
"Slate RL Recommendation": plot_slate_rl_reco_pipeline,
"Fictitious Play Interaction": plot_game_theory_fictitious_play,
"Universal RL Framework Diagram": plot_universal_rl_framework,
"Offline Density Ratio Estimator": plot_offline_density_ratio,
"Continual Task Interference Heatmap": plot_continual_task_interference,
"Lyapunov Stability Safe Set": plot_lyapunov_safe_set,
"Molecular RL (Atom Coordinates)": plot_molecular_rl_atoms,
"MoE Multi-task Architecture": plot_moe_multi_task_arch,
"CMA-ES Policy Search": plot_cma_es_distribution,
"Elo Rating Preference Plot": plot_elo_rating_preference,
"Explainable RL (SHAP Attribution)": plot_shap_lime_attribution,
"PEARL Context Encoder": plot_pearl_context_encoder,
"Medical RL Therapy Pipeline": plot_healthcare_rl_pipeline,
"Supply Chain RL Pipeline": plot_supply_chain_rl,
"Sim-to-Real SysID Loop": plot_sysid_safe_loop,
"Transformer World Model": plot_transformer_world_model,
"Network Traffic RL": plot_network_rl,
"RLHF: PPO with Reference Policy": plot_rlhf_ppo_ref,
"PSRO Meta-Game Update": plot_psro_meta_game,
"DIAL: Differentiable Comm": plot_dial_comm_channel,
"Fitted Q-Iteration Loop": plot_fqi_batch_loop,
"CMDP Feasible Region": plot_cmdp_feasible_set,
"MPC vs RL Planning": plot_mpc_vs_rl_horizon,
"Learning to Optimize (L2O)": plot_l2o_meta_pipeline,
"Smart Grid RL Management": plot_smart_grid_rl,
"Quantum State Tomography RL": plot_quantum_tomography_rl,
"Absolute Universal RL Pillar Map": plot_absolute_encyclopedia_map,
"RL for Chip Placement": plot_chip_placement_rl,
"RL Compiler Optimization (MLGO)": plot_compiler_mlgo,
"RL for Theorem Proving": plot_theorem_proving_rl,
"Diffusion-QL Offline RL": plot_diffusion_ql_loop,
"Fairness-reward Pareto Frontier": plot_fairness_rl_pareto,
"Differentially Private RL": plot_dp_rl_noise,
"Smart Agriculture RL": plot_smart_agriculture_rl,
"Climate Mitigation RL (Grid)": plot_climate_rl_grid,
"AI Education (Knowledge Tracing)": plot_ai_education_tracing,
"Decision SDE Flow": plot_decision_sde_flow,
"Differentiable physics (Brax)": plot_diff_physics_brax,
"Wireless Beamforming RL": plot_beamforming_rl,
"Quantum Error Correction RL": plot_quantum_error_correction_rl,
"Mean Field RL Interaction": plot_mean_field_rl,
"Goal-GAN Curriculum": plot_goal_gan_hrl,
"JEPA: Predictive Architecture": plot_jepa_arch,
"CQL Value Penalty Landscape": plot_cql_penalty_surface,
"Cybersecurity Attack-Defense RL": plot_cyber_attack_defense,
"Causal Inverse RL Graph": plot_causal_irl,
"VQE-RL Optimization": plot_vqe_rl,
"De-novo Drug Discovery RL": plot_drug_discovery_rl,
"Traffic Signal Coordination RL": plot_traffic_signal_coordination,
"Mars Rover Pathfinding RL": plot_mars_rover_pathfinding,
"Sports Player Movement RL": plot_sports_analytics_rl,
"Cryptography Attack RL": plot_crypto_attack_rl,
"Humanitarian Resource RL": plot_humanitarian_rl,
"Video Compression RL (Rate-Distortion)": plot_video_compression_rl,
"Kubernetes Auto-scaling RL": plot_kubernetes_scaling_rl,
"Fluid Dynamics Flow Control RL": plot_fluid_dynamics_rl,
"Structural Optimization RL": plot_structural_optimization_rl,
"Human Decision Modeling (Prospect Theory)": plot_human_decision_rl,
"Semantic Parsing RL": plot_semantic_parsing_rl,
"Melody generation RL": plot_music_melody_rl,
"Plasma Fusion Control RL": plot_plasma_control_rl,
"Carbon Capture RL cycle": plot_carbon_capture_rl,
"Swarm Robotics RL": plot_swarm_robotics_rl,
"Legal Compliance RL Game": plot_legal_compliance_rl,
"Physics-Informed RL (PINN)": plot_pinn_rl_loss,
"Neuro-Symbolic RL": plot_neuro_symbolic_rl,
"DeFi Liquidity Pool RL": plot_defi_liquidity_rl,
"Dopamine Reward Prediction Error": plot_dopamine_rpe_curves,
"Proprioceptive Sensory-Motor RL": plot_proprioceptive_rl_loop,
"Augmented Reality Object Placement RL": plot_ar_placement_rl,
"Sequential Bundle RL": plot_sequential_bundle_rl,
"Online Gradient Descent vs RL": plot_ogd_vs_rl_gradient,
"Active Learning: Query RL Selection": plot_active_learning_selection,
"Federated RL global Aggregator": plot_federated_rl_tree,
"Ultimate Universal RL Mastery Diagram": plot_ultimate_mastery_diagram
}
for name, func in mapping.items():
# Sanitize filename
filename = re.sub(r'[^a-zA-Z0-9]', '_', name.lower()).strip('_')
filename = re.sub(r'_+', '_', filename) + ".png"
filepath = os.path.join(output_dir, filename)
print(f"Generating: {filename} ...")
plt.close('all')
if func in [plot_reward_landscape, plot_loss_landscape]:
fig = plt.figure(figsize=(10, 8))
gs = GridSpec(1, 1, figure=fig)
func(fig, gs)
plt.savefig(filepath, bbox_inches='tight', dpi=100)
plt.close(fig)
continue
fig, ax = plt.subplots(figsize=(10, 8), constrained_layout=True)
func(ax)
plt.savefig(filepath, bbox_inches='tight', dpi=100)
plt.close(fig)
print(f"\n[SUCCESS] Saved {len(mapping)} graphs to '{output_dir}/' directory.")
if __name__ == "__main__":
import sys
if "--save" in sys.argv:
save_all_graphs()
else:
main()
|