File size: 110,183 Bytes
685d968 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 |
# TINKER DOCUMENTATION
This file contains the complete Tinker documentation and SDK reference.
## Table of Contents
1. Documentation (MDX files)
2. Type Definitions (from tinker.types)
---
# PART 1: DOCUMENTATION
## File: index.mdx
# Tinker: a training API for researchers and developers
Tinker lets you focus on what matters in LLM fine-tuning – your data and algorithms – while we handle the heavy lifting of distributed training.
You write a simple loop that runs on your CPU-only machine, including the data or environment and the loss function. We figure out how to make the training work on a bunch of GPUs, doing the exact computation you specified, efficiently. To change the model you're working with, you only need to change a single string in your code.
Tinker gives you full control over the training loop and all the algorithmic details. It's not a magic black box that makes fine-tuning "easy". It's a clean abstraction that shields you from the complexity of distributed training while preserving your control.
Here's how the division of responsibilities works in practice:
| **You focus on** | **You write** | **We handle** |
|---|---|---|
| 📊 **Datasets and RL environments**<br />Your custom training data | 💻 **Simple Python script**<br />Runs on your CPU | ⚡ **Efficient distributed training of large models**<br />Llama, Qwen, and more |
| 🎯 **Training logic**<br />Your loss functions, training loop, and evals | 🔧 **API calls**<br />`forward_backward()`<br />`optim_step()`<br />`sample()` | 🛡️ **Reliability**<br />Hardware failures handled transparently |
## Features
What the Tinker service currently supports:
- Tinker lets you fine-tune open-weight models like the Qwen and Llama series, including large mixture-of-experts models like Qwen3-235B-A22B.
- Tinker implements low-rank adaptation (LoRA) fine-tuning, not full fine-tuning. However, we believe that LoRA gives the same performance as full fine-tuning for many important use cases, especially in RL (see [LoRA Without Regret](https://thinkingmachines.ai/blog/lora/)).
- You can download the weights of your trained model to use outside of Tinker, for example with your inference provider of choice.
## A quick look at functionality
Tinker's main functionality is contained in a few key functions:
- `forward_backward`: feed in your data and loss function, and we'll compute and accumulate the gradients for you.
- `optim_step`: update your model using the accumulated gradients
- `sample`: Generate outputs from your trained model
- other functions for saving and loading weights and optimizer state
## What's next?
Some features we expect to support in the future:
- Image input for applicable models
- Full fine-tuning
---
## File: losses.mdx
# Loss functions in Tinker
For most use cases, you can use the Tinker API's built-in loss functions by passing in a string identifier to `forward_backward`, which supports cross entropy and policy gradient objectives. When you need more control, `forward_backward_custom` enables arbitrary differentiable loss functions at the cost of an additional forward pass; we explain both approaches in this doc.
When you call `forward_backward`, you specify a loss function using a string that selects from a predetermined set of options, comprising the most common losses used for language model training.
- **Input:** `forward_backward` expects a certain set of input tensors, passed in via `datum.loss_fn_inputs`, which is a dict mapping `str` to either a numpy or torch tensor
- **Output:** `forward_backward` returns a `ForwardBackwardOutput`, which has a set of output tensors in `fwd_bwd_result.loss_fn_outputs`
For an example of using `forward_backward`, see `rl/train.py` in the Cookbook:
```python
async def forward_backward(
training_client: tinker.TrainingClient,
batch_d: List[tinker.Datum],
) -> List[torch.Tensor]:
"""Accumulate gradients on a minibatch of data"""
fwd_bwd_future = await training_client.forward_backward_async(
list(map(remove_mask, batch_d)), loss_fn="importance_sampling"
)
fwd_bwd_result = await fwd_bwd_future.result_async()
# Extract training logprobs from loss_fn_outputs
training_logprobs_D: list[torch.Tensor] = []
for output in fwd_bwd_result.loss_fn_outputs:
training_logprobs = output["logprobs"].to_torch()
training_logprobs_D.append(training_logprobs)
return training_logprobs_D
```
## Basic loss functions
Currently, the Tinker API supports `cross_entropy` (for supervised learning), `importance_sampling` (for RL), and `ppo` (for RL).
All tensors below have shape `(N,)` where `N` is `model_input.length`. They can be provided as `numpy.ndarray` or `torch.Tensor`, and the return values will use the same tensor type.
### Supervised learning: `cross_entropy`
For SL, we implement the standard cross-entropy loss (i.e., negative-log-likelihood), which optimizes the policy $p_\theta$ to maximize the log-probability of the tokens $x$:
$$
\mathcal{L(\theta)} = -\mathbb{E}_x[\log p_\theta(x)]
$$
In practice, this looks like `-(weights * logp(target_tokens)).sum()`, where `weights` is either 0 or 1, typically generated from `renderers.build_supervised_example` (i.e., to specify the desired assistant turns to train on).
- **Input tensors:**
- `target_tokens: array[(N,), int]` - Target token IDs
- `weights: array[(N,), float]` - Token-level loss weights (typically from the renderer)
- **Output tensors:**
- `logprobs: array[(N,), float]` - Log probabilities of predicted tokens
- **Output diagnostics:**
- `loss:sum` (scalar) - Sum of weighted cross-entropy losses
### Policy gradient: `importance_sampling`
For RL, we implement a common variant of the policy gradient objective, used in practical settings where the *learner policy* $p$ may differ from the *sampling policy* $q$, which is common due to e.g. [non-determinism](https://thinkingmachines.ai/blog/defeating-nondeterminism-in-llm-inference/). To remove the bias caused by this difference, we can use a modified "importance sampling" objective:
$$
\nabla \mathbb{E}_{x\sim p_\theta}\bigl[r(x) \bigr] = \mathbb{E}_{x\sim q}\Bigl[r(x) \cdot \frac{\nabla p_\theta(x)}{q(x)}\Bigr]
$$
which yields the correct expected reward in expectation. This is implemented as `(exp(new_logprobs - logprobs) * advantages).sum()`, where `advantages` may additionally subtract a baseline from the rewards. Note this only works in the bandit setting, which is common in both RLHF and RLVR setups.
- **Input tensors:**
- `target_tokens: array[(N,), int]` - Target token IDs (from the sampler $q$)
- `logprobs: array[(N,), float]` - Reference log probabilities $q$ for the target tokens
- `advantages: array[(N,), float]` - Advantage values for RL
- **Output tensors:**
- `logprobs: array[(N,), float]` - Log probabilities $p$ for the target tokens
- **Output diagnostics:**
- `loss:sum` (scalar) - Sum of importance-weighted policy gradient losses
**Addendum:** Let's consider naively applying the policy gradient objective when $q \neq p$:
$$
\begin{align*}
\mathbb{E}_{x\sim q}\bigr[ r(x) \cdot \nabla \log p_\theta(x) \bigl] &= \sum_x q(x) r(x) \cdot \nabla \log p_\theta(x) \\
&= \sum_x q(x) (r(x) - \bar{r}) \nabla \log p_\theta(x) + \sum_x q(x) \bar{r} \cdot \nabla \log p_\theta(x) \\
&= \mathbb{E}_{x\sim q}\bigl[(r(x) - \bar{r}) \nabla \log p_\theta(x)\bigr] - \bar{r} \cdot \nabla KL(q \Vert p)
\end{align*}
$$
where $\bar{r} = \sum_x q(x) r(x)$, effectively an average-reward baseline.
- The first expectation term resembles a pseudo-policy gradient, increasing the log-likelihood of tokens $x$ which achieve higher-than-average rewards. (It is not an actual policy gradient, because $q \neq p$.)
- The second KL term is effectively a bias term which can destablize RL optimization. This bias increases as either the divergence $KL(q \Vert p)$ grows, or as the average reward $\bar{r}$ shifts.
## Flexible loss functions: `forward_backward_custom`
For use-cases outside of the above, we've provided the more flexible (but slower) methods `forward_backward_custom` and `forward_backward_custom_async` to compute a more general class of loss functions.
### Usage
Here's a simple example of a custom loss function:
```python
def logprob_squared_loss(data: list[Datum], logprobs: list[torch.Tensor]) -> tuple[torch.Tensor, dict[str, float]]:
loss = (logprobs ** 2).sum()
return loss, {"logprob_squared_loss": loss.item()}
```
You can call this loss function with `forward_backward_custom` like:
```python
loss, metrics = training_client.forward_backward_custom(data, logprob_squared_loss)
```
You can also define loss functions which operate on multiple sequences at a time. For example, (although practically useless), a loss function that computes the variance across the sequences can be implemented as:
```python
def variance_loss(data: list[Datum], logprobs: list[torch.Tensor]) -> tuple[torch.Tensor, dict[str, float]]:
flat_logprobs = torch.cat(logprobs)
variance = torch.var(flat_logprobs)
return variance, {"variance_loss": variance.item()}
```
A more practical use case would be to compute a Bradley-Terry loss on pairwise comparison data -- a classic approach in RL from human feedback, as introduced/popularized by [Learning to Summarize](https://arxiv.org/abs/2009.01325). Similarly, we can also implement [Direct Preference Optimization](https://arxiv.org/abs/2305.18290), which also computes a loss involving pairs of sequences; see the [DPO guide](/preferences/dpo-guide) for more details.
If you're using a custom loss function that you think is generally useful, please let us know, and we'll add it to the list of built-in loss functions.
We detail the `async` version of methods in the [Async and Futures](./async) of these docs.
### How `forward_backward_custom` works
---
## File: supervised-learning.mdx
import { CookbookLink } from '../components/CookbookLink'
# Supervised learning
In general, supervised learning means learning an input-output mapping from labeled data. In the context of this documentation, it'll mean that we're minimizing a weighted cross-entropy loss on token sequences, i.e., maximizing the log-probability of the specififed target tokens.
There are a few ways that supervised learning (abbreviated as SL) is commonly used in LLM fine-tuning pipelines:
- *Instruction tuning*: as the first step in post-training pipelines, applied to the *base model*, i.e., *raw pretrained model*. Typically, we do SL on a high-quality dataset that demonstrates the correct format and style, while boosting the model's skills of reasoning and instruction-following.
- *Context distillation* / *prompt distillation*: let's say we have a generic model that can do chat / instruction following / reasoning, but we want to adjust how it behaves in a certain scenario. We can add some instructions to the system message of our model. However, the system message might grow impractically long and start ignoring some of its instructions. So it's often better to create a supervised dataset on a narrow prompt distribution, with a shorter set of instructions that that are targeted at these prompts.
We'll cover these use cases in the documentation and cookbook code.
The library code implementing supervised learning can be found in the <CookbookLink path="tinker_cookbook/supervised">`supervised`</CookbookLink> directory.
---
## File: preferences.mdx
# Preferences
This overview page will cover learning from preferences.
*This is a placeholder page that needs to be written.*
---
## File: lora-primer.mdx
# LoRA Primer
Tinker supports [LoRA fine-tuning](https://arxiv.org/abs/2106.09685), which adjusts a small number of parameters, rather than full fine-tuning, which adjusts all of the parameters of the original model. Our current understanding is that LoRA has equivalent performance to full fine-tuning when doing RL or doing SL on small datasets, while it has worse performance on larger datasets. In more detail:
- For supervised fine-tuning on a small dataset, with fewer completion tokens than the number of LoRA parameters, then LoRA gives roughly the same performance as full fine-tuning.
- For supervised fine-tuning on a large dataset (like continued pre-training, or datasets with billions of tokens), LoRA will significantly underperform full fine-tuning.
- For reinforcement learning fine-tuning, LoRA performs roughly the same as full fine-tuning.
- While it has deficits when fine-tuning on larger datasets, LoRA may also typically better preserves the capabilities of the base model, according to [this study](https://arxiv.org/abs/2405.09673).
See the Connectionism blog post on [LoRA Without Regret](https://thinkingmachines.ai/blog/lora/) for more details and experimental results.
# Hyperparameters
The learning rate (LR) is usually the most important hyperparameter in your ML experiments. LoRA requires a much larger LR than full fine-tuning -- this is a mistkae people often make when porting their code to use LoRA, leading them to conclude that LoRA works poorly. Depending on the model size, the LoRA learning rate might be 20-100 times larger than the full fine-tuning learning rate.
We've provided a utility that calculates the factor you should scale the full fine-tuning LR by to get the equivalent LoRA LR.
```python
from tinker_cookbook.hyperparam_utils import get_lora_lr_over_full_finetune_lr
model_name = "meta-llama/Llama-3.1-8B"
print(get_lora_lr_over_full_finetune_lr(model_name))
```
Note that for `Llama-3.2-1B`, the factor is 32, while for `Llama-3.1-70B`, the factor is 128.
# What is LoRA exactly?
LoRA is short for Low-Rank Adaptation. Given that the original model has a weight matrix $W$, we replace it with a new weight matrix $W'=W + BA$, where $B$ and $A$ are low-rank matrices. If $W$ is an $n \times n$ matrix, then $B$ and $A$ are $n \times r$ and $r \times n$ matrices, respectively, where $r$ is the rank of the low-rank approximation. The default $r$ used by tinker is $32$.
The fact that LoRA uses a low-rank approximation of weight matrices is not terribly important -- we prefer to think of LoRA as just a random projection of the parameter space that happens to be efficient to implement. When training with RL or small SL datasets, we are only learning a small amount of information, and this reduced set of parameters is more than enough.
# What rank to use?
The default rank used by tinker is $32$. However, if you're doing SL on a large dataset, you should use a larger rank. The largest rank currently supported is $128$, so you should use this rank if you are doing SL on a large dataset. Supporting ranks $512$ and higher is on the roadmap.
For supervised learning, as a very rough approximation, LoRA will give good results as long as the number of LoRA parameters is at least as large as the number of completion tokens (i.e., weight=1 tokens). You can calculate the number of LoRA parameters with the following utility:
```python
from tinker_cookbook.hyperparam_utils import get_lora_param_count
model_name = "meta-llama/Llama-3.1-8B"
print(get_lora_param_count(model_name, lora_rank=32))
```
For reinforcement learning, we've found that small ranks give equivalent performance to larger ranks and full fine-tuning.
Note that conveniently, the optimal learning rate does *not* depend on the LoRA rank. In fact, you can verify that if you train with SL on different ranks (but with the same LR), you'll get exactly the same learning curves for the first few steps of training.
---
## File: evals.mdx
import { Callout } from 'nextra/components'
import { CookbookLink } from '../components/CookbookLink'
# Evaluations
Our training scripts will print out training and test loss. Two common workflows for evaluations are to do inline evals during training and to do offline evals on various checkpoints from a run.
## Inline Evals
You can add inline evaluations to your training runs by configuring evaluator builders in advance for both supervised fine-tuning and RL training jobs.
### Supervised Fine-Tuning (`supervised.train`)
Add one or both of the following to your config:
- **`evaluator_builders: list[EvaluatorBuilder]`** - Runs evaluations every `eval_every` steps
- **`infrequent_evaluator_builders: list[EvaluatorBuilder]`** - Runs evaluations every `infrequent_eval_every` steps
### RL Training (`rl.train`)
Add the following to your config:
- **`evaluator_builders: list[SamplingClientEvaluator]`** - Runs evaluations every `eval_every` steps
For implementation guidance and a detailed example, see <CookbookLink path="tinker_cookbook/eval/evaluators.py">here</CookbookLink> and
<CookbookLink path="tinker_cookbook/eval/inspect_evaluators.py">here</CookbookLink> respectively.
## Offline evals
We support and recommend several ways for creating and running your offline evaluations on your model checkpoints.
### Running Standard Evaluations with Inspect AI.
We support running many of the standard cited evaluations using the [Inspect AI library](https://github.com/UKGovernmentBEIS/inspect_ai).
We have provided a <CookbookLink path="tinker_cookbook/eval/run_inspect_evals.py">script</CookbookLink> to evaluate models using Tinker's internal sampling functionality as shown below.
```bash
MODEL_PATH=tinker://FIXME # YOUR MODEL PATH HERE
python -m tinker_cookbook.eval.run_inspect_evals \
model_path=$MODEL_PATH \
model_name=MODEL_NAME \ # YOUR MODEL_NAME HERE
tasks=inspect_evals/ifeval,inspect_evals/mmlu_0_shot \
renderer_name=RENDERER_NAME # YOUR RENDERER_NAME HERE
```
Click [here](https://github.com/UKGovernmentBEIS/inspect_ai/blob/main/docs/evals/listing.yml) to view additional supported evaluations.
### Creating your own Sampling Evaluations
We recommend two ways to create your own evaluations:
- creating your own tasks with Inspect AI and running like above
- creating your own SamplingClientEvaluator
#### Create tasks with Inspect AI
In addition to passing in standard evaluations, you can create your own tasks using inspect ai as detailed [here](https://inspect.aisi.org.uk/tasks.html).
Here is a toy example of how to create an evaluation with an LLM-as-a-judge where we use a model produced by tinker as a grader.
```python
import tinker
from inspect_ai import Task, task
from inspect_ai.dataset import MemoryDataset, Sample
from inspect_ai.model import GenerateConfig as InspectAIGenerateConfig
from inspect_ai.model import Model as InspectAIModel
from inspect_ai.scorer import model_graded_qa
from inspect_ai.solver import generate
from tinker_cookbook.eval.inspect_utils import InspectAPIFromTinkerSampling
QA_DATASET = MemoryDataset(
name="qa_dataset",
samples=[
Sample(
input="What is the capital of France?",
target="Paris",
),
Sample(
input="What is the capital of Italy?",
target="Rome",
),
],
)
service_client = tinker.ServiceClient()
sampling_client = service_client.create_sampling_client(
base_model="meta-llama/Llama-3.1-8B-Instruct"
)
api = InspectAPIFromTinkerSampling(
renderer_name="llama3",
model_name="meta-llama/Llama-3.1-8B-Instruct",
sampling_client=sampling_client,
verbose=False,
)
GRADER_MODEL = InspectAIModel(api=api, config=InspectAIGenerateConfig())
@task
def example_lm_as_judge() -> Task:
"""
Example task using LLM-as-a-judge scoring.
Note: The grader model defaults to the model being evaluated.
To use a different grader model, specify it with --model-grader when using inspect directly.
"""
return Task(
name="llm_as_judge",
dataset=QA_DATASET,
solver=generate(),
scorer=model_graded_qa(
instructions="Grade strictly against the target text as general answer key and rubric. "
"Respond 'GRADE: C' if correct or 'GRADE: I' otherwise.",
partial_credit=False,
# model parameter is optional - if not specified, uses the model being evaluated
model=GRADER_MODEL,
),
)
```
Inspect also natively supports replacing our `GRADER_MODEL` with any openai-chat-completion style api (e.g. openrouter).
#### Create your own SamplingClientEvaluator
Alternatively, you can create your own SamplingClientEvaluator class instead of using Inspect AI. This is a lower
level abstraction than the above with finer-grain control over running your evaluations.
We expose this to interace to allow users more control over their datasets and metrics. To illustrate, see this
<CookbookLink path="tinker_cookbook/eval/custom_evaluators.py">custom evaluators</CookbookLink> example of how one might create their own complex SamplingClientEvaluator.
For a more illustrative toy instructive example see below.
```python
from typing import Any, Callable
import tinker
from tinker import types
from tinker_cookbook import renderers
from tinker_cookbook.evaluators import SamplingClientEvaluator
from tinker_cookbook.tokenizer_utils import get_tokenizer
class CustomEvaluator(SamplingClientEvaluator):
"""
A toy SamplingClientEvaluator that runs a custom evaluation and returns its metrics.
"""
def __init__(
self,
dataset: Any,
grader_fn: Callable[[str, str], bool],
model_name: str,
renderer_name: str,
):
"""
Initialize the CustomEvaluator.
Args:
config: Configuration object containing all evaluation parameters
"""
self.dataset = dataset
self.grader_fn = grader_fn
tokenizer = get_tokenizer(model_name)
self.renderer = renderers.get_renderer(name=renderer_name, tokenizer=tokenizer)
async def __call__(self, sampling_client: tinker.SamplingClient) -> dict[str, float]:
"""
Run custom evaluation on the given sampling client and return metrics.
Args:
sampling_client: The sampling client to evaluate
Returns:
Dictionary of metrics from inspect evaluation
"""
metrics = {}
num_examples = len(self.dataset)
num_correct = 0
sampling_params = types.SamplingParams(
max_tokens=100,
temperature=0.7,
top_p=1.0,
stop=self.renderer.get_stop_sequences(),
)
for datum in self.dataset:
model_input: types.ModelInput = self.renderer.build_generation_prompt(
[renderers.Message(role="user", content=datum["input"])]
)
# Generate response
r: types.SampleResponse = await sampling_client.sample_async(
prompt=model_input, num_samples=1, sampling_params=sampling_params
)
tokens: list[int] = r.sequences[0].tokens
response: renderers.Message = self.renderer.parse_response(tokens)[0]
if self.grader_fn(response["content"], datum["output"]):
num_correct += 1
metrics["accuracy"] = num_correct / num_examples
return metrics
```
Here is an example of how we can use the above CustomEvaluator on a toy dataset and grader.
```python
QA_DATASET = [
{"input": "What is the capital of France?", "output": "Paris"},
{"input": "What is the capital of Germany?", "output": "Berlin"},
{"input": "What is the capital of Italy?", "output": "Rome"},
]
def grader_fn(response: str, target: str) -> bool:
return target.lower() in response.lower()
evaluator = CustomEvaluator(
dataset=QA_DATASET,
grader_fn=grader_fn,
renderer_name="llama3",
model_name="meta-llama/Llama-3.1-8B-Instruct",
)
service_client = tinker.ServiceClient()
sampling_client = service_client.create_sampling_client(base_model="meta-llama/Llama-3.1-8B-Instruct")
async def main():
result = await evaluator(sampling_client)
print(result)
asyncio.run(main())
```
---
## File: dev-tips.mdx
# Developer Tips
## AI-assisted development
We've provided a single-file version of the documentation that can be fed to LLMs for development: see [llms.txt](/llms.txt) and [llms-full.txt](/llms-full.txt).
---
## File: async.mdx
# Async and Futures
## Sync and Async APIs
Every method in the Tinker Python library has both a synchronous (sync) and an asynchronous (async) version. The async variants end with `_async`:
| **Client** | **Sync method** | **Async method** |
|---|---|---|
| `ServiceClient` | `create_lora_training_client()` | `create_lora_training_client_async()` |
| `TrainingClient` | `forward()` | `forward_async()` |
| `SamplingClient` | `sample()` | `sample_async()` |
| `RestClient` | `list_training_run_ids()` | `list_training_run_ids_async()` |
Tinker's `async` functionality requires an `asyncio` event loop, which you typically run like `asyncio.run(main())`.
**When to use each:**
- **Async:** Best for high-performance workflows where you need concurrency, especially when waiting on multiple network calls.
- **Sync:** Simpler for scripts and learning examples. Easier to reason about but blocks on each operation.
The Tinker Cookbook generally uses `async` for implementations where performance is critical and sync for pedagogical examples.
## Understanding Futures
Most Tinker API methods are **non-blocking**, but may take a little while to run. They return immediately with a `Future` object that acknowledges that your request has been submitted. To get the actual result, you must explicitly wait:
**Sync Python:**
```python
result = client.forward_backward_async(data, loss_fn)
result = result.result_async() # Blocks until complete
```
**Async Python (note the double await):**
```python
result = await client.forward_backward_async(data, loss_fn)
result = await result.result_async()
```
After the first `await`, you're guaranteed that the request has been submitted, which ensures that it'll be ordered correctly relative to other requests. The second `await` waits for the actual computation to finish and returns the numerical outputs. For operations like `forward_backward`, the second `await` also guarantees that operation has been applied to the model---for `forward_backward`, this means that the gradients have been accumulated in the model's optimizer state.
## Performance tips: overlap requests
For best performance, you should aim to submit your next request while the current one is running. Doing so is more important with Tinker than with other training systems because Tinker training runs on discrete [clock cycles](./under-the-hood#clock-cycles) (~10 seconds each). If you don't have a request queued when a cycle starts, you'll miss that cycle entirely.
**Example pattern for overlapping requests:**
```python
# Submit first request
future1 = await client.forward_backward_async(batch1, loss_fn)
# Submit second request immediately (don't wait for first to finish)
future2 = await client.forward_backward_async(batch2, loss_fn)
# Now retrieve results
result1 = await future1.result_async()
result2 = await future2.result_async()
```
---
## File: overview-building.mdx
# Overview (Tinker Cookbook)
The next sections provide a variety of guides for how to use the Tinker API for research and applications.
We expect people to use Tinker in a few different ways:
1. You want to define datasets and environments and plug them into existing training code from the Tinker Cookbook.
2. You want to write your own training loops from scratch, starting with the basics.
3. You want to understand the classes and other concepts in Tinker Cookbook so you can extend them to add new functionality.
Different parts of the docs will be tailored to these different approaches.
We'll start with a couple of general pages that'll be relevant to almost all of the use cases:
- [Rendering to Tokens](./rendering.mdx) -- how we convert from a conversation data structure to a list of tokens (a.k.a. chat templates).
- [LoRA Primer](./lora-primer.mdx) -- basic background of LoRA, and how to choose hyperparameters. For most fine-tuning applications, LoRA will give results that are roughly the same as full fine-tuning, however, you need to use different learning rates.
---
## File: save-load.mdx
# Saving and loading weights and optimizer state
During training, you'll need to save checkpoints for two main purposes: *sampling* (to test your model) and *resuming training* (to continue from where you left off). The `TrainingClient` provides three methods to handle these cases:
1. `save_weights_for_sampler()`: saves a copy of the model weights that can be used for sampling.
2. `save_state()`: saves the weights and the optimizer state. You can fully resume training from this checkpoint.
3. `load_state()`: load the weights and the optimizer state. You can fully resume training from this checkpoint.
Note that (1) is faster and requires less storage space than (2).
Both `save_*` functions require a `name` parameter---a string that you can set to identify the checkpoint within the current training run. For example, you can name your checkpoints `"0000"`, `"0001"`, `"step_1000"`, etc.
The return value contains a `path` field, which is a fully-qualified path, which will look something like `tinker://<model_id>/<name>`. This path is persistent and can be loaded later by a new `ServiceClient` or `TrainingClient`.
### Example: Saving for sampling
```python
# Setup
import tinker
service_client = tinker.ServiceClient()
training_client = service_client.create_lora_training_client(
base_model="meta-llama/Llama-3.2-1B", rank=32
)
# Save a checkpoint that you can use for sampling
sampling_path = training_client.save_weights_for_sampler(name="0000").result().path
# Create a sampling client with that checkpoint
sampling_client = service_client.create_sampling_client(model_path=sampling_path) #
```
**Shortcut:** Combine these steps with:
```python
sampling_client = training_client.save_weights_and_get_sampling_client(name="0000")
```
### Example: Saving to resume training
Use `save_state()` and `load_state()` when you need to pause and continue training with full optimizer state preferred:
```python
# Save a checkpoint that you can resume from
resume_path = training_client.save_state(name="0010").result().path
# Load that checkpoint
training_client.load_state(resume_path)
```
### When to use `save_state()` and `load_state()`:
- Multi-step training pipelines (e.g. supervised learning followed by reinforcement learning)
- Adjusting hyperparameters or data mid-run
- Recovery from interruptions or failures
- Any scenario where you need to preserve exact optimizer state (momentum, learning rate schedules, etc.)
---
## File: training-sampling.mdx
import { Callout } from 'nextra/components'
# Getting started with training and sampling
In this guide, we'll step you through using the Tinker Python library to do the basic operations needed for training and sampling.
[View the complete Python script →](/quickstart.py.txt)
## Creating the training client
The main object we'll be using is the `TrainingClient`, which corresponds to a fine-tuned model that we can train and sample from.
First, set your Tinker API key environment variable. In the terminal where you'll run Python, or in your `.bashrc`, put `export TINKER_API_KEY=<your key>`.
Then, create a `ServiceInterface`. This lets you find out what base models are available to be fine-tuned.
```python
import tinker
service_client = tinker.ServiceClient()
print("Available models:")
for item in service_client.get_server_capabilities().supported_models:
print("- " + item.model_name)
```
You'll see a list of model names:
```
- meta-llama/Llama-3.1-70B
- meta-llama/Llama-3.1-8B
...
- Qwen/Qwen3-235B-A22B-Instruct-2507
- Qwen/Qwen3-30B-A3B-Base
```
We currently support models from the Qwen3 and Llama3 series. We're going to use Qwen3-30B-A3B-Base (a raw pre-trained model) for these examples. See [Available Models in Tinker](/model-lineup) for the full list.
Now we can create the `TrainingClient`:
```python
base_model = "Qwen/Qwen3-30B-A3B-Base"
training_client = service_client.create_lora_training_client(
base_model=base_model
)
```
Note that we've specified a *base model*, which is the model we'll initialize from. In this case, it's a raw pre-trained model, but most of the "base models" in Tinker are fine-tuned for chat/instruction-following. You should check the details of the model you're using in their system cards.
## Preparing the training data
Now we can do training updates on the model. This quickstart example won't show best practices for LLM fine-tuning; it's just an API demo. Check out [Rendering](/rendering), [Supervised Fine-tuning](/supervised-learning) and the other Cookbook examples for guidance on how to use Tinker in real applications.
For this model, we'll train a model that can translate words into Pig Latin. The rules for Pig Latin are simple:
- If a word begins with a consonant, move it to the end and add "ay"
- If a word begins with a vowel, just add "way" to the end
Here are some example completions we'd like the model to perform, where the prompt is in green and the model's completion is in red:
<div className="example">
<span className="prompt">English: hello world<br/>
Pig Latin: </span><span className="completion">ello-hay orld-way</span>
</div>
Let's create some training examples and convert them to a format expected by Tinker.
```python
# Create some training examples
examples = [
{
"input": "banana split",
"output": "anana-bay plit-say"
},
{
"input": "quantum physics",
"output": "uantum-qay ysics-phay"
},
{
"input": "donut shop",
"output": "onut-day op-shay"
},
{
"input": "pickle jar",
"output": "ickle-pay ar-jay"
},
{
"input": "space exploration",
"output": "ace-spay exploration-way"
},
{
"input": "rubber duck",
"output": "ubber-ray uck-day"
},
{
"input": "coding wizard",
"output": "oding-cay izard-way"
},
]
# Convert examples into the format expected by the training client
from tinker import types
# Get the tokenizer from the training client
tokenizer = training_client.get_tokenizer()
def process_example(example: dict, tokenizer) -> types.Datum:
# Format the input with Input/Output template
# For most real use cases, you'll want to use a renderer / chat template,
# (see later docs) but here, we'll keep it simple.
prompt = f"English: {example['input']}\nPig Latin:"
prompt_tokens = tokenizer.encode(prompt, add_special_tokens=True)
prompt_weights = [0] * len(prompt_tokens)
# Add a space before the output string, and finish with double newline
completion_tokens = tokenizer.encode(f" {example['output']}\n\n", add_special_tokens=False)
completion_weights = [1] * len(completion_tokens)
tokens = prompt_tokens + completion_tokens
weights = prompt_weights + completion_weights
input_tokens = tokens[:-1]
target_tokens = tokens[1:] # We're predicting the next token, so targets need to be shifted.
weights = weights[1:]
# A datum is a single training example for the loss function.
# It has model_input, which is the input sequence that'll be passed into the LLM,
# loss_fn_inputs, which is a dictionary of extra inputs used by the loss function.
return types.Datum(
model_input=types.ModelInput.from_ints(tokens=input_tokens),
loss_fn_inputs=dict(weights=weights, target_tokens=target_tokens)
)
processed_examples = [process_example(ex, tokenizer) for ex in examples]
# Visualize the first example for debugging purposes
datum0 = processed_examples[0]
print(f"{'Input':<20} {'Target':<20} {'Weight':<10}")
print("-" * 50)
for i, (inp, tgt, wgt) in enumerate(zip(datum0.model_input.to_ints(), datum0.loss_fn_inputs['target_tokens'].tolist(), datum0.loss_fn_inputs['weights'].tolist())):
print(f"{repr(tokenizer.decode([inp])):<20} {repr(tokenizer.decode([tgt])):<20} {wgt:<10}")
```
The visualization of the first example is:
```
Input Target Weight
--------------------------------------------------
'English' ':' 0.0
':' ' I' 0.0
' I' ' love' 0.0
' love' ' tink' 0.0
' tink' 'ering' 0.0
'ering' '\n' 0.0
'\n' 'P' 0.0
'P' 'ig' 0.0
'ig' ' Latin' 0.0
' Latin' ':' 0.0
':' ' I' 1.0
' I' '-way' 1.0
'-way' ' o' 1.0
' o' 've' 1.0
've' '-l' 1.0
'-l' 'ay' 1.0
'ay' ' ink' 1.0
' ink' 'ering' 1.0
'ering' '-t' 1.0
'-t' 'ay' 1.0
'ay' '<|endoftext|>' 1.0
```
## Performing a training update
Now we can use this data to perform a training update. We'll do 6 updates on the same batch of data. (Note that this is not typically a good way to train!)
```python
import numpy as np
for _ in range(6):
fwdbwd_future = training_client.forward_backward(processed_examples, "cross_entropy")
optim_future = training_client.optim_step(types.AdamParams(learning_rate=1e-4))
# Wait for the results
fwdbwd_result = fwdbwd_future.result()
optim_result = optim_future.result()
# fwdbwd_result contains the logprobs of all the tokens we put in. Now we can compute the weighted
# average log loss per token.
logprobs = np.concatenate([output['logprobs'].tolist() for output in fwdbwd_result.loss_fn_outputs])
weights = np.concatenate([example.loss_fn_inputs['weights'].tolist() for example in processed_examples])
print(f"Loss per token: {-np.dot(logprobs, weights) / weights.sum():.4f}")
```
Note that the `forward_backward` and `optim_step` functions immediately return *futures*, which acknowledge that the task has been queued up by the server. For improved speed, we submitted both operations before waiting for the result by calling `result()` on the futures.
## Sampling from the model
Now we can test our model by sampling from it. In this case, we'll translate the phrase "coffee break" into Pig Latin.
```python
# First, create a sampling client. We need to transfer weights
sampling_client = training_client.save_weights_and_get_sampling_client(name='pig-latin-model')
# Now, we can sample from the model.
prompt=types.ModelInput.from_ints(tokenizer.encode("English: coffee break\nPig Latin:"))
params = types.SamplingParams(max_tokens=20, temperature=0.0, stop=["\n"]) # Greedy sampling
future = sampling_client.sample(prompt=prompt, sampling_params=params, num_samples=8)
result = future.result()
print("Responses:")
for i, seq in enumerate(result.sequences):
print(f"{i}: {repr(tokenizer.decode(seq.tokens))}")
```
Since sampling is nondeterministic (sadly, even with temperature=0.0, [due to batching](https://thinkingmachines.ai/blog/defeating-nondeterminism-in-llm-inference/)), the output will be different each time. You should see something like this:
```
Responses:
0: ' offe-bay eak-bay\n\n'
1: ' offey-coy eak-bray\n\n'
2: ' offecay eakbray\n\n'
3: ' offeec-cay eak-brcay\n\n\n'
4: ' offecay akebay\n\n'
5: ' offee-Cay ake-bay\n\n\n'
6: ' offey-pay eak-bray\n\n'
7: ' offee – cay eak – bray\n\n'
```
---
## File: rendering.mdx
import { CookbookLink } from '../components/CookbookLink'
# Rendering
Rendering is the process of converting list-of-message datatypes into their token representations and is roughly the same as [chat templates](https://huggingface.co/docs/transformers/en/chat_templating); however, the chat template functionality in other libraries is only well-suited for inference, whereas this library provides a more complete interface that handles supervised and reinforcement learning.
## Renderer Class
The Renderer class is the main interface used for rendering and can be found in <CookbookLink path="tinker_cookbook/renderers.py">`renderers.py`</CookbookLink>.
Let's use the following working example of a conversation between a user and an assistant.
```python
messages =[
{'role': 'system', 'content': 'Answer concisely; at most one sentence per response'},
{'role': 'user', 'content': 'What is the longest-lived rodent species?'},
{'role': 'assistant', 'content': 'The naked mole rat, which can live over 30 years.'},
{'role': 'user', 'content': 'How do they live so long?'},
{'role': 'assistant', 'content': 'They evolved multiple protective mechanisms including special hyaluronic acid that prevents cancer, extremely stable proteins, and efficient DNA repair systems that work together to prevent aging.'}
]
```
## Generating messages
Our model maps tokens to tokens, but with the renderer, it can map messages to messages. To sample messages from the model, we need to use three methods from the renderer:
- `build_generation_prompt`
- `get_stop_sequences`
- `parse_response`
`build_generation_prompt` converts a conversation into a prompt that we can use to sample from the assistant. This is used during reinforcement learning and at deployment time.
Let's remove the last assistant message and call `build_generation_prompt` to get a prompt that we can use to sample an alternative response from the assistant.
```python
from tinker_cookbook import renderers, tokenizer_utils
tokenizer = tokenizer_utils.get_tokenizer('Qwen/Qwen3-30B-A3B')
renderer = renderers.get_renderer('qwen3', tokenizer)
prompt = renderer.build_generation_prompt(messages[:-1])
print(prompt)
print('-'*10)
print(tokenizer.decode(prompt.to_ints()))
```
First you can see that the prompt is a `ModelInput` object, which is a list of `EncodedTextChunk` objects (but contains different objects in multi-modal data).
```
ModelInput(chunks=[EncodedTextChunk(tokens=[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 8948, 198, 16141, 3529, 285, 974, 26, 518, 1429, 825, 11652, 817, 2033, 151645, 198, 151644, 872, 198, 3838, 374, 279, 22032, 61854, 20589, 306, 9419, 30, 151645, 198, 151644, 77091, 198, 785, 19020, 34651, 11244, 11, 892, 646, 3887, 916, 220, 18, 15, 1635, 13, 151645, 198, 151644, 872, 198, 10234, 30, 151645, 198, 151644, 77091, 198], type='encoded_text')])
----------
<|im_start|>system
Answer concisely; at most one sentence per response<|im_end|>
<|im_start|>user
What is the longest-lived rodent species?<|im_end|>
<|im_start|>assistant
The naked mole rat, which can live over 30 years.<|im_end|>
<|im_start|>user
How do they live so long?<|im_end|>
<|im_start|>assistant
```
Given that we're providing messages as input, we probably want a message output, rather than a token output. For that, we can use `parse_response`.
```python
import tinker
from tinker.types import SamplingParams
service_client = tinker.ServiceClient()
sampling_client = service_client.create_sampling_client(base_model='Qwen/Qwen3-30B-A3B')
stop_sequences = renderer.get_stop_sequences()
print(f"Stop sequences: {stop_sequences}")
sampling_params = SamplingParams(max_tokens=100, temperature=0.5, stop=stop_sequences)
output = sampling_client.sample(prompt, sampling_params=sampling_params, num_samples=1).result()
print(f"Sampled tokens: {output.sequences[0].tokens}")
sampled_message, parse_success = renderer.parse_response(output.sequences[0].tokens)
print(f"Sampled message: {sampled_message}")
print(f"Parse success: {parse_success}")
```
We get the following output:
```
Stop sequences: [151645]
Sampled tokens: [45, 7741, 34651, 31410, 614, 4911, 76665, 11, 2670, 264, 7548, 11050, 22077, 1849, 323, 264, 1602, 3347, 40761, 4379, 11, 892, 16792, 311, 862, 57119, 13, 151645]
Sampled message: {'role': 'assistant', 'content': 'Naked mole rats have unique adaptations, including a highly efficient immune system and a very low metabolic rate, which contribute to their longevity.'}
Parse success: True
```
You can see that the there is one stop sequence, 151645, which you can verify is the `<|im_end|>` token. The output is parsed successfully into a message.
## Supervised learning
For supervised learning, along with some other algorithms like [DPO](/preferences/dpo-guide), we need different information from the renderer -- we want to provide a target assistant message, and the renderer needs to tell us which tokens are part of the prompt and completion.
To do this, we can use `build_supervised_example` as follows:
```python
tokens, weights = renderer.build_supervised_example(messages)
from tinker_cookbook.utils.format_colorized import format_colorized
print(format_colorized(tokens, weights, tokenizer))
```
We get the following output:
<div className="example">
<span className="prompt"><|im_start|>system↵<br />Answer concisely; at most one sentence per response<|im_end|>↵<br /><|im_start|>user↵<br />What is the longest-lived rodent species?<|im_end|>↵<br /><|im_start|>assistant↵<br />The naked mole rat, which can live over 30 years.<|im_end|>↵<br /><|im_start|>user↵<br />How do they live so long?<|im_end|>↵<br /><|im_start|>assistant↵<br /></span>
<span className="completion">They evolved multiple protective mechanisms including special hyaluronic acid that prevents cancer, extremely stable proteins, and efficient DNA repair systems that work together to prevent aging.<|im_end|><br /></span>
</div>
The green text is part of the prompt (i.e. with `weight=0`) and red is part of the completion (i.e. with `weight=1`). Note that the ↵ have been inserted just to make it clear whether each newline is part of the prompt or the completion; these are not actually part of the token sequence.
## Appendix: why don't you just use jinja-formatted chat templates?
In our experience, the Jinja2 templates are harder to write than python code -- especially when we need to get the whitespace exactly right. They are also unwieldy for supervised learning, where you need to put different labels on different tokens.
---
## File: completers.mdx
import { CookbookLink } from '../components/CookbookLink'
# Completers
The concept of policies is crucial to the RL training process. In the Tinker Cookbook, policies are implemented as `Completers`. Completers are abstractions that represent models or policies that can be sampled from, providing different levels of structure depending on your use case.
## Overview of Completer Types
The Tinker Cookbook provides two main types of completers, each designed for different use cases:
1. **TokenCompleter**: Operates on tokens and is used by RL algorithms
2. **MessageCompleter**: Operates on messages and needs to be used with a renderer
The choice between these depends on whether you're working at the token level for RL training or at the message level for interacting with and evaluating the model.
### TokenCompleter
The `TokenCompleter` is the foundational interface used by RL algorithms because they work directly with tokens.
```python
class TokenCompleter:
async def __call__(
self, model_input: types.ModelInput, stop: StopCondition
) -> TokensWithLogprobs:
```
This interface takes:
- `model_input`: The input to the model (of type `types.ModelInput`)
- `stop`: Stop conditions, either a list of strings or token IDs (combined into a `StopCondition` class). When training with reinforcement learning, this should be defined by the `initial_observation` function of the environment.
It returns a `TokensWithLogprobs` object containing:
- `tokens`: The generated token sequence
- `maybe_logprobs`: Optional log probabilities for each token
### MessageCompleter
The `MessageCompleter` operates at a higher level with structured messages, similarly to standard chat APIs. It takes a list of messages and returns a single assistant message response.
```python
class MessageCompleter:
async def __call__(self, messages: list[renderers.Message]) -> renderers.Message:
```
For training purposes the `TokenCompleter` is the class we will use for RL training as we need to optimiza the same same set of tokens during the update step that the model output during rollout. The `MessageCompleter` is useful for sampling where we need to use the model output for semantic purposes such as Judge models or multi-agent environments.
The Tinker Cookbook uses two concrete implementations of these interfaces - <CookbookLink path="tinker_cookbook/completers.py">`TinkerTokenCompleter`</CookbookLink> and <CookbookLink path="tinker_cookbook/completers.py">`TinkerMessageCompleter`</CookbookLink> which are both wrappers around a `tinker.SamplingClient`. While the TinkerTokenCompleter operates directly on tokens, the TinkerMessageCompleter needs to be instantiated with a renderer to make it compatible with the inputs expected by the samping client.
---
## File: install.mdx
# Installing Tinker
There are two packages to install:
- [tinker](https://github.com/thinking-machines-lab/tinker), which performs basic low-level operations like `forward_backward`
- [tinker-cookbook](https://github.com/thinking-machines-lab/tinker-cookbook), which is a collection of training code and experiment tools built on top of Tinker
Both packages can be installed with `pip` from their GitHub repositories:
```bash
pip install git+https://github.com/thinking-machines-lab/tinker.git
pip install git+https://github.com/thinking-machines-lab/tinker-cookbook.git
```
However, for the Cookbook, we'd recommend doing a local editable install, as you'll probably want to browse and edit the code:
```bash
git clone https://github.com/thinking-machines-lab/tinker-cookbook.git
cd tinker-cookbook
pip install -e .
```
Tinker Cookbook also has a few optional dependencies, which you can install with the following command in an editable install:
```bash
pip install -e ".[envs]"
```
Or, for a non-editable install:
```bash
pip install "git+https://github.com/thinking-machines-lab/tinker-cookbook.git#egg=tinker-cookbook[envs]"
```
---
## File: rl.mdx
import { CookbookLink } from '../components/CookbookLink'
# Reinforcement learning
Reinforcement learning (RL) means learning from trial and error. Whereas in supervised learning, we're given input-output pairs, in RL, we're given inputs (prompts) and reward functions (i.e., a function for scoring candidate outputs). RL algorithms need to discover what good outputs look like.
Here are a few different types of RL training that we support in the Tinker Cookbook:
- *RL with Verifiable Rewards*: this is when we do RL on a reward function that checks model outputs using a program. Typically, the reward function checks the candidate answer against a reference answer, or, in coding cases, it may check if the candidate solution passes some unit tests. RLVR is especially suitable for teaching models to do reasoning (with chain-of-thought) and multi-step tool use (e.g., debugging and iterative modification pf programs).
- *RL on Human Feedback*: here, we assume we have an objective that can't be calculated by a simple program, and it requires some human judgement. For example, we typically want to optimize our models for helpfulness, which includes being clear, informative, and interesting. For RLHF, we train a *preference model* using supervised learning to match human judgement, scoring or ranking candidate outputs. Then we do RL on the preference model's scores. See the [Preferences](/preferences) section for more details.
We'll first show how to do small RL runs in the RLVR setting, then we'll show you how to define your own RL environments and train on them, then we'll provide examples for larger-scale or more complicated training setups.
We anticipate that people will want to use tinker for RL in a few different ways:
- Creating a specialist model that's SoTA at a specific skill, which existing models haven't been trained on. In this case, you'll want to start with a post-trained model that's already strong, and then do RL on an environment you've defined. See [RL Environments](/rl/rl-envs).
- Doing research on post-training pipelines. In this case, you'll probably want to chain together SL and RL and runs with different data mixes, environments, and reward functions. See our [RLHF example](/preferences/rlhf-example).
- Doing research on RL algorithms. Here, you'll probably want to find some existing environments to use as benchmarks, and either modify our provided training code (<CookbookLink path="tinker_cookbook/rl/train.py">rl/train.py</CookbookLink>) or write your own minimal training loop. We've provided a [minimal training loop](/rl/rl-loops) that you can use as a starting point.
---
## File: model-lineup.mdx
# Available Models in Tinker
The table below shows the models that are currently available in Tinker. We plan to update this list frequently as new models are released.
## What model should I use?
- In general, use MoE models, which are more cost effective than the dense models.
- Use 🐙 Base models only if you're doing research or are running the full post-training pipeline yourself
- If you want to create a model that is good at a specific task or domain, use an existing post-trained model model, and fine-tune it on your own data or environment.
- If you care about latency, use one of the "⚡ Instruction" models, which will start outputting tokens without a chain-of-thought.
- If you care about intelligence and robustness, use one of the "🤔 Hybrid" models, which can use long chain-of-thought.
## Full Listing
| Model Name | Training Type | Architecture | Size |
|------------|--------------|--------------|------|
| [Qwen/Qwen3-235B-A22B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507) | ⚡ Instruction | 🔀 MoE | 🦖 Large |
| [Qwen/Qwen3-30B-A3B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507) | ⚡ Instruction | 🔀 MoE | 🦅 Medium |
| [Qwen/Qwen3-30B-A3B](https://huggingface.co/Qwen/Qwen3-30B-A3B) | 🤔 Hybrid | 🔀 MoE | 🦅 Medium |
| [Qwen/Qwen3-30B-A3B-Base](https://huggingface.co/Qwen/Qwen3-30B-A3B-Base) | 🐙 Base | 🔀 MoE | 🦅 Medium |
| [Qwen/Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) | 🤔 Hybrid | 🧱 Dense | 🦅 Medium |
| [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) | 🤔 Hybrid | 🧱 Dense | 🦆 Small |
| [Qwen/Qwen3-8B-Base](https://huggingface.co/Qwen/Qwen3-8B-Base) | 🐙 Base | 🧱 Dense | 🦆 Small |
| [Qwen/Qwen3-4B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507) | ⚡ Instruction | 🧱 Dense | 🐣 Compact |
| [meta-llama/Llama-3.3-70B](https://huggingface.co/meta-llama/Llama-3.3-70B) | ⚡ Instruction | 🧱 Dense | 🦖 Large |
| [meta-llama/Llama-3.1-70B](https://huggingface.co/meta-llama/Llama-3.1-70B) | 🐙 Base | 🧱 Dense | 🦖 Large |
| [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) | 🐙 Base | 🧱 Dense | 🦆 Small |
| [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) | ⚡ Instruction | 🧱 Dense | 🦆 Small |
| [meta-llama/Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B) | 🐙 Base | 🧱 Dense | 🐣 Compact |
| [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) | 🐙 Base | 🧱 Dense | 🐣 Compact |
## Legend
### Training Types
- 🐙 **Base**: Foundation models trained on raw text data, suitable for post-training research and custom fine-tuning
- ⚡ **Instruction**: Models fine-tuned for following instructions and chat, optimized for fast inference
- 🤔 **Hybrid**: Models that can operate in both thinking and non-thinking modes
### Architecture
- 🧱 **Dense**: Standard transformer architecture with all parameters active
- 🔀 **MoE**: Mixture of Experts architecture with sparse activation
### Model Sizes
- 🐣 **Compact**: 1B-4B parameters
- 🦆 **Small**: 8B parameters
- 🦅 **Medium**: 30B-32B parameters
- 🦖 **Large**: 70B+ parameters
Note that the MoE models are much more cost effective than the dense models. E.g. the Qwen3-30B-A3B model has only 3B active parameters, so it'll cost around the same as a 3B dense model for training and inference.
---
## File: preferences/dpo-guide.mdx
import { Callout } from 'nextra/components'
import { CookbookLink } from '../../components/CookbookLink'
# Direct Preference Optimization (DPO)
Direct Preference Optimization (DPO) is a method for training language models to align with human preferences without requiring a separate reward model. Instead of using reinforcement learning with human feedback (RLHF), DPO directly optimizes the model to prefer chosen responses over rejected ones using a simple classification loss.
## DPO Algorithm Details
The core DPO loss is computed as:
$$
\mathcal{L}_{\theta} = -\mathbb{E}_{x, y_\text{chosen}, y_\text{rejected} \sim \mathcal{D}}\left[\log\sigma\left(\beta\log \frac{\pi_{\theta}(y_\text{chosen}|x)}{\pi_{\text{ref}}(y_\text{chosen}|x)} - \beta\log \frac{\pi_{\theta}(y_\text{rejected}|x)}{\pi_{\text{ref}}(y_\text{rejected}|x)}\right)\right]
$$
Where:
- $\pi_{\theta}$ is the current policy
- $\pi_{\text{ref}}$ is the reference model (typically the initial model before DPO training)
- $\beta$ is the DPO beta parameter
- Where $\mathcal{D}$ is a dataset of prompts $x$, a chosen response $y_{\text{chosen}}$ and a rejected response $y_{\text{rejected}}$
This optimizes the classical constrianed RLHF objective, where the reference model constrains deviation from the initial distribution.
<Callout type="info">
**DPO vs RLHF**: DPO eliminates the need for a separate reward model by directly optimizing the policy to prefer chosen responses. This makes training simpler and computationally cheaper than classical RLHF.
</Callout>
## Running DPO Training
The implementation is in <CookbookLink path="train_dpo.py">train_dpo.py</CookbookLink> with a CLI interface in <CookbookLink path="train_dpo_cli.py">train_dpo_cli.py</CookbookLink>. You can run it from the command line:
```bash
python -m tinker_cookbook.preference.train_dpo_cli \
log_path=/tmp/dpo-hhh-experiment \
model_name=meta-llama/Llama-3.2-1B \
dataset=hhh \
renderer_name=role_colon \
learning_rate=1e-5 \
dpo_beta=0.1
```
### Key Parameters
- `log_relpath`: Directory where results and checkpoints are saved
- `model_name`: Base model used as initialization and for the reference policy
- `dataset`: Dataset name (`hhh`, `helpsteer3`, `ultrafeedback`)
- `renderer_name`: How conversations are formatted (see [Rendering](../rendering.mdx))
- `learning_rate`: Learning rate for optimization
- `dpo_beta`: DPO beta parameter (controls the strength of preference learning)
### Available Datasets
There are several pre-defined datasets:
- **`hhh`**: Anthropic's Helpful-Harmless-Honest dataset
- **`helpsteer3`**: NVIDIA's HelpSteer3 preference dataset
- **`ultrafeedback`**: UltraFeedback binarized preferences dataset
These are implemented as `DPODatasetBuilder` classes and you can implement a custom dataset builder following the `tinker_cookbook.preference.preference_datasets` interface.
## Training Process
During training, you'll see output like this showing the DPO metrics:
```
Step 50
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃ Metric ┃ Value ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩
│ accuracy │ 0.568627 │
│ batch_time │ 27.953704 │
│ chosen_reward │ 0.053621 │
│ dpo_loss │ 0.683825 │
│ learning_rate │ 0.000009 │
│ margin │ 0.002147 │
│ num_pairs │ 255 │
│ num_tokens │ 112638 │
│ progress │ 0.081210 │
│ rejected_reward │ 0.032152 │
│ test/nll │ 1.871778 │
└────────────────────────────────┴───────────┘
```
The key metrics are:
- **`dpo_loss`**: The DPO classification loss
- **`accuracy`**: Accuracy of the implicit reward model evaluated on the preference dataset
- **`margin`**: Average difference between chosen and rejected rewards
- **`chosen_reward`/`rejected_reward`**: Average rewards for chosen/rejected responses
## Evaluating DPO Models
After training, you can evaluate your DPO model using the inspect evaluation framework:
```bash
MODEL_PATH=tinker://YOUR_MODEL_PATH_HERE
python -m tinker_cookbook.run_inspect_evals \
model_path=$MODEL_PATH \
model_name=meta-llama/Llama-3.2-1B \
tasks=ifeval,paws \
renderer_name=role_colon
```
This will evaluate the model on various benchmarks to measure the impact of preference optimization.
## Tips for DPO Training
1. **Beta Parameter**: Start with `dpo_beta=0.1` and adjust based on your dataset.
2. **Learning Rate**: Use a lower learning rate than supervised fine-tuning (typically 1e-5 to 1e-6).
3. **Base Model**: The base model should already be in-distribution with the preference data. Either start with a ligh SFT phase or collect on-policy preferences. While training would still work. sharp distribution mis-match will create strange model behaviors.
---
## File: preferences/rlhf-example.mdx
import { CookbookLink } from '../../components/CookbookLink'
# Reinforcement Learning from Human Feedback
## Train a pairwise preference model
We can first use supervised learning to train a model to do pairwise comparisons. To do this, we format the comparisons such that the model sees the prompt and the two completions, and it learns to predict the label (which was was preferred). Here, we train on the [Anthropic HHH](https://huggingface.co/datasets/Anthropic/hh-rlhf) dataset.
```bash
model_name=meta-llama/Llama-3.1-8B-Instruct
python -m tinker_cookbook.supervised.train_cli \
log_path=/tmp/tinker-examples/hhh-pm \
dataset=hhh \
model_name=$model_name \
learning_rate=4e-4
```
After the training is done, we can make note of the path of the final checkpoint. This path is printed to the console, and it's also in the `metrics.jsonl` file.
## Do RL using the reward model
Next, we can run RL using this reward model. In <CookbookLink path="tinker_cookbook/rl/train_cli.py">rl/train_cli.py</CookbookLink>, we've hardcoded the tinker path of a Llama-8B-Instruct model that was we trained on the dataset above. Here, we define the reward function by generating 4 completions per prompt, and doing 4x3=12 pairwise comparisons between these completions, using the reward model.
```bash
python -m tinker_cookbook.rl.train_cli \
env=hhh \
log_path=/tmp/tinker-examples/hhh-rl \
groups_per_batch=256 \
group_size=4 \
max_tokens=400 \
learning_rate=2e-5
```
TODO: evaluate with reward model and inspect evals
---
## File: rl/rl-basic.mdx
import { CookbookLink } from '../../components/CookbookLink'
# Your First RL Run
We've provided a minimal script that runs RL on the [GSM8K dataset](https://huggingface.co/datasets/openai/gsm8k): <CookbookLink path="tinker_cookbook/recipes/rl_basic.py">rl_basic.py</CookbookLink>. For running this script, tinker-cookbook needs to be installed with optional envs dependencies (`pip install -e ".[envs]"`). You can run the minimal RL script from the command line as follows:
```bash
python -m tinker_cookbook.recipes.rl_basic
```
This script will fine-tune the Llama-3.1-8B base (pretrained) model on this dataset with the following reward function:
$$
1[\text{answer is correct}] + 0.1 \times (1[\text{answer is formatted correctly}] - 1)
$$
The training should take about 1 minute per iteration and climb to about 63% accuracy after 15 iterations (`env/all/correct`). You can look at the printouts for some other metrics of interest:
- `ac_tokens_per_turn`: the number of each tokens in each generated completion
- `env/all/format`: the fraction of completions that are formatted correctly
- `env/all/reward/total`: mean total reward (combining format and correctness as defined above)
- `entropy`: per-token entropy (mean negative log-probability of sampled tokens)
- `kl_sample_train_{v1,v2}`: two different approximations/estimators of KL divergence between the sampler's and learner's probability distribution (contributed to by numerical differences and rounding noise)
- `progress/done_frac`: what fraction of the total number of iterations we've completed so far
- `time/...`: time for different parts of the training loop
You can also look at the `log_path` directory for more detailed metrics. There are several files of interest, which are mostly the same as in the [Supervised Learning](/supervised-learning/sl-basic) case.
---
## File: rl/rl-hyperparams.mdx
# RL Hyperparameters
This guide covers the key hyperparameters for reinforcement learning training, from core settings to advanced configurations.
## Core Hyperparameters
### Learning Rate
Similar to the [supervised learning setting](../supervised-learning/sl-hyperparams), the learning rate is the most critical hyperparameter choice. We recommend using the guidance presented there as a starting point for RL experiments as well.
### Batch and Group Sizes
As described in our [RL environments](../rl/rl-envs.mdx) documentation, we use two key parameters:
- **`batch_size`**: The number of unique environments or problems used for training
- **`group_size`**: The number of rollouts performed per unique environment
If you have limited environments or problems available for training, increase the `group_size` to generate more training data. While the total number of rollouts depends on both parameters, we recommend scaling learning rates proportionally to $\text{LR} \propto \sqrt{\text{batch\_size}}$.
## Multiple Updates per Sampling Iteration
The `num_substeps` parameter controls how many policy weight updates are performed on data sampled from the last policy iteration, similar to PPO and GRPO.
### How it works:
- **`num_substeps = 1` (default)**: Each batch of collected trajectories is used for exactly one optimizer update
- **`num_substeps > 1`**: The batch of unique environments is split into `num_substeps` mini-batches, where each environment/problem has `group_size` rollouts (we pack all rollouts for a particular environment/problem in the same minibatch). We do a single update step on each mini-batch. Note that our implementation still takes only a single epoch through the data.
### Usage Guidelines:
- The batch size must be divisible by `num_substeps`
- Our experiments show that `num_substeps = 1` already gives decent performance, but if you would like to experiment with this parameter, we recommend starting with a low value of 2-4 and using the PPO objective.
- Higher values can lead to update steps that are too out-of-distribution for the policy. Consider limiting the number of updates or decreasing the learning rate when using multiple update steps.
## Advanced Training Configurations
⚠️ **Note**: These features are experimental and may be subject to instabilities. They are currently disabled by default.
### Streaming Minibatch Training
Enable streaming minibatch training by specifying the `StreamMinibatchConfig`. This approach overlaps trajectory sampling and model training, improving overall throughput by submitting training requests as soon as enough rollouts complete, without waiting for all sampling jobs to finish.
**Configuration Parameters:**
- **`groups_per_batch`**: Same as batch size (TODO: consider getting this from the dataset builder, also general naming around these is not great now)
- **`num_minibatches`**: Number of minibatches per substep—controls how many individual forward-backward requests we submit. This controls how the work is split (TODO: Why do we need this? We should just send a group as soon as it's ready).
**Important**: This remains on-policy training and is strictly a pipeline efficiency improvement.
### Async Off-Policy Training
Async training allows the model to train on trajectories generated with slightly older model versions, enabling higher throughput at the cost of some off-policy bias. While Tinker doesn't currently support in-flight weight changes, it supports the "off-by-K" async RL approach where multiple model iterations generate data simultaneously. Configure this by setting the `AsyncConfig` object.
**Configuration Parameters:**
- **`max_steps_off_policy`**: Maximum age (in training steps) of trajectories before they're discarded. Essentially, trajectories from policy iterations older than `max_steps_off_policy` steps will not be used.
- **`groups_per_batch`**: Number of new trajectory groups to accumulate (with a `group_size` number of rollouts each) before updating the current iteration of the model. Note: This is separate from the batch size used for dataset construction.
**Usage Guidelines:**
- Async RL is appropriate for applications with long and heterogeneous rollouts, such as very long CoT models, multi-hop tool use, or agentic workflows
- Start with a small value for `max_steps_off_policy` (less than 5)
## Monitoring and Run Health
Using policy-gradient algorithms with off-policy data can significantly degrade performance or even crash the policy, making monitoring essential during training.
### KL Divergence Monitoring
The current implementation logs the KL divergence between the data generation policy and the current learner: $\mathbb{D}_{KL}[\pi_{\text{sampler}}(\cdot|x)||\pi_{\theta}(\cdot|x)]$ using two separate [estimators](http://joschu.net/blog/kl-approx.html):
- `kl_sample_train_v1`
- `kl_sample_train_v2`
A few important notes to keep in mind:
- Even with full on-policy training, the divergence between sampling and learning policies will not be [exactly zero](https://thinkingmachines.ai/blog/defeating-nondeterminism-in-llm-inference/) due to implementation details
- In our experience training ist stable with KL divergence below 0.01
- If KL divergence crosses the recommended threshold, this indicates a numerical instability or potential issue with the training run
---
## File: rl/rl-loops.mdx
import { CookbookLink } from '../../components/CookbookLink'
# RL Training Loop
In this section we will walk through the RL training loop using a minimum working example on the Hendrycks Math dataset.
## Dataset Construction
The Hendrycks Math environment uses the <CookbookLink path="tinker_cookbook/rl/math_env.py">`MathDatasetBuilder`</CookbookLink> as described in `Creating RL Environments` to create structured batches of mathematical problems:
```python
from tinker_cookbook.rl.math_env import MathDatasetBuilder
dataset_builder = MathDatasetBuilder(
batch_size=64, # 64 problem groups per training batch
group_size=32, # 32 solution attempts per problem
model_name_for_tokenizer="meta-llama/Llama-3.1",
renderer_name="llama3",
convo_prefix="standard" # Includes few-shot examples
)
train_dataset, test_dataset = dataset_builder()
```
Here `MathDatasetBuilder` is a `RLDatasetBuilder` that creates a `RLDataset` where we have a batch of 64 problems, each of which creates a group of 32 identical environments.
## Policy (Completer) Construction
Following the `Completers` section, the policy is implemented as a <CookbookLink path="tinker_cookbook/completers.py">`TinkerTokenCompleter`</CookbookLink> that wraps the sampling client:
```python
import tinker
from tinker_cookbook.completers import TinkerTokenCompleter
service_client = tinker.ServiceClient()
training_client = await service_client.create_lora_training_client_async(
base_model="meta-llama/Llama-3.1-8B", rank=32
)
async def get_policy(step: int):
# Save a checkpoint that you can use for sampling
sampling_path = training_client.save_weights_for_sampler(name="xxxx").result().path
# Create a sampling client with that checkpoint
sampling_client = service_client.create_sampling_client(model_path=sampling_path) #
# Wrap in completer interface for RL algorithms
return TinkerTokenCompleter(
sampling_client=sampling_client,
max_tokens=512
)
```
At each training step the `get_policy` function will save he training weights and update the sampling client with the new weights to generate on-policy data.
## Rollout Collection Loop
The key step of the RL training loop is collecting new on-policy data after each policy update. For each training batch we have
```python
import asyncio
from tinker_cookbook.completers import TokenCompleter
from tinker_cookbook.rl.rollouts import do_group_rollout
async def generate_rollouts(step: int, policy: TokenCompleter):
# Creates 64 builders, each building 32 MathEnv instances
env_group_builders_P = train_dataset.get_batch(step)
# Generate rollouts for each group of 32 environments
trajectory_groups_P = await asyncio.gather(
*[do_group_rollout(builder, policy) for builder in env_group_builders_P]
)
taglist_P = [builder.logging_tags() for builder in env_group_builders_P]
return trajectory_groups_P, taglist_P
```
Here the `do_group_rollout` function will run the policy in the environments and return a list of of Trajectory Groups. The `TrajectoryGroup` is an object that consists of a list of trajectories (a `Trajectory` object) and, final rewards and potentially other metadata. The rollout will return a `batch_size` list of `TrajectoryGroup` objects, each of which consists of `group_size` `Trajectory` objects.
## Data Processing
After we collect new data, we need to process it for training. We compute advantages by centering rewards within each problem group. At this step we can also optionally filter out groups with all successes or all failures as these have policy gradients of zero. Finally the `assemble_training_data` function converts the trajectories into token-level training examples.
```python
from typing import List
from tinker_cookbook.rl.types import TrajectoryGroup
from tinker_cookbook.rl.train import (
_remove_constant_reward_groups,
compute_advantages,
assemble_training_data
)
def process_trajectory_groups(trajectory_groups_P: List[TrajectoryGroup]):
# (Optionally) Remove groups with all successes or all failures
filtered_trajectory_groups_P = (
_remove_constant_reward_groups(trajectory_groups_P)
)
# Compute advantages for each trajectory in each group
advantages_P = compute_advantages(filtered_trajectory_groups_P)
# Convert trajectories to token-level training examples
data_D, _metadata_D = assemble_training_data(filtered_trajectory_groups_P, advantages_P)
return data_D, _metadata_D
```
This function will return a list of training data `types.Datum` objects, as well as a list of metadata dictionaries.
## Model Training Step
The model update uses importance sampling loss:
```python
from typing import List
import torch
import tinker
from tinker import types
from tinker_cookbook.rl.train import _remove_mask
async def train_step(
data_D: List[types.Datum],
training_client: tinker.TrainingClient,
learning_rate: float,
) -> List[torch.Tensor]:
"""Train the model on collected trajectories."""
# Forward-backward pass on all data
fwd_bwd_future = await training_client.forward_backward_async(
list(map(_remove_mask, data_D)), loss_fn="importance_sampling"
)
# Optimizer step
adam_params = types.AdamParams(learning_rate=learning_rate, beta1=0.9, beta2=0.95, eps=1e-8)
optim_step_future = await training_client.optim_step_async(adam_params)
# Wait for results
fwd_bwd_result = await fwd_bwd_future.result_async()
_optim_step_result = await optim_step_future.result_async()
```
Here we use the standard importance sampling loss policy gradient loss function:
```math
\mathcal{L}_{\theta} = -\mathbb{E}_{x\sim \mathcal{D}, y \sim \pi_{\text{sampler}}(\cdot|x)}\left[\textbf{sg}\left(\frac{\pi_{\theta}(y|x)}{\pi_{\text{sampler}}(y|x)}\right)A(x, y)\log\pi_{\theta}(y|x)\right]
```
where we use the group-level advantages $A(x, y)$ to scale the policy gradient. Notice that even when we have exact on-policy RL, slight differences between the trainer and sampler model (even with the same weights) can introduce off-policy issues that can (severely) negatively affect learning.
## Complete Training Loop
Putting it all together, the complete training loop iterates between the following steps:
1. Create a policy with the current model weights
2. Generate rollouts
3. Process trajectory data into training examples
4. Update model parameters
```python
import time
from tinker_cookbook.utils import ml_log
from tinker_cookbook.utils.misc_utils import timed
from tinker_cookbook.rl.metric_util import compute_trajectory_metrics
async def rl_loop():
"""Complete RL training loop for Hendrycks Math problems."""
# Setup dataset
dataset_builder = MathDatasetBuilder(
batch_size=64,
group_size=32,
model_name_for_tokenizer="meta-llama/Llama-3.1-8B",
renderer_name="llama3",
convo_prefix="standard"
)
train_dataset, test_dataset = dataset_builder()
# Setup training client
service_client = tinker.ServiceClient()
training_client = await service_client.create_lora_training_client_async(
"meta-llama/Llama-3.1-8B", rank=32
)
# Setup logging
log_dir = str("~/experiments/math-rl")
num_batches = len(train_dataset)
learning_rate = 1e-5
# Main training loop
for i_batch in range(num_batches):
# Setup metrics for logging
t_start = time.time()
metrics = {
"progress/batch": i_batch,
"optim/lr": learning_rate,
"progress/done_frac": (i_batch + 1) / num_batches,
}
# 1. Create policy with current weights
policy = await get_policy(i_batch)
# 2. Generate rollouts
with timed("generate_rollouts", metrics):
# Generate rollouts: 64 groups × 32 environments = 2,048 total rollouts
trajectory_groups_P, taglist_P = await generate_rollouts(i_batch, policy)
# Compute trajectory metrics
metrics.update(compute_trajectory_metrics(trajectory_groups_P, taglist_P))
# 3. Process trajectory data into training examples
data_D, _metadata_D = process_trajectory_groups(trajectory_groups_P)
# 4. Update model parameters
await train_step(data_D, training_client, learning_rate)
# Log metrics
metrics["time/total"] = time.time() - t_start
ml_logger.log_metrics(metrics, step=i_batch)
```
## Running RL training
To run a training job you can use the our basic RL training <CookbookLink path="tinker_cookbook/recipes/rl_basic.py">`script`</CookbookLink>.
---
## File: rl/rl-envs.mdx
import { CookbookLink } from '../../components/CookbookLink'
# RL Environments
Here, we'll explain how to create your own RL environments and train on them. First, lets look at the basic classes, which can be found in <CookbookLink path="tinker_cookbook/rl/types.py">`tinker_cookbook.rl.types`</CookbookLink>. As you can see, there's an `Env` interface, corresponding to an RL environment. To write an environment, you need to implement two methods: `initial_observation` and `step`.
```python
class Env:
"""
Stateful environment that a single agent interacts with.
Discard after running for one episode.
"""
async def initial_observation(self) -> tuple[Observation, StopCondition]:
raise NotImplementedError
async def step(self, action: Action) -> StepResult:
raise NotImplementedError
```
Note that this `Env` operates on *tokens*, rather than strings or messages. Why define it this way, when it's usually more natural to define the logic in terms of strings or messages? We've defined `Env` this way because this interface is what's needed by the *training* code. The training code needs to know the exact tokens that were sampled, and their logprobs. It might be more convenient to write the logic in terms of strings or messages. See [Env Adapters](/rl/env-adapters) for more on how to write adapters between our `Env` class and other libraries.
We need to write two more small classes to use this environment in the RL training code. First, since the environment is discarded after a single episode, we need to be able to instantiate new environments in the training loop. We actually build a *group* of environments at a time, which enables multi-agent training or objectives that compare multiple samples (for example, a reward model that acts on a pair of samples).
```python
class EnvGroupBuilder:
"""
Builds a group of environments.
"""
async def make_envs(self) -> Sequence[Env]:
raise NotImplementedError
```
This object creates a group of environments. Often it does the trivial thing of returning a list of copies of the same environment.
Finally, we need a dataset of these EnvGroupBuilders.
```python
class RLDataset:
"""
Dataset of EnvGroupBuilders.
"""
def get_batch(self, index: int) -> list[EnvGroupBuilder]:
raise NotImplementedError
```
That's a lot of classes! But their combination gives us a lot of flexibility. In previous implementations (like OpenAI Gym), the dataset is implicitly part of the environment; this structure is more modular and gives us more control over the data loading.
## Building a simple example
You can find an example of writing a new RL environment in the <CookbookLink path="tinker_cookbook/recipes/twenty_questions">Twenty Questions</CookbookLink> directory.
Here, we define a multi-step environment, where we're training a question-asking agent, which asks questions to another agent to guess a hidden word.
In this case, the answerer model is fixed and is Llama-3.1-8B-Instruct.
The player model (which we fine-tune) is also based on that same model.
You can run the training script as follows:
```bash
uv run python -m tinker_cookbook.recipes.twenty_questions.train
```
---
## File: supervised-learning/sl-hyperparams.mdx
# Supervised Learning Hyperparameters
Successful fine-tuning of any LLM requires careful tuning of the hyperparameters. While the most accurate way to tune hyperparameters is to sweep over a range of values for each hyperparameter and pick the values that minimize the loss or maximize the evals, this is often time-consuming and expensive. We provide some starting recommendations for the most important hyperparameters.
## Learning rate
The most important hyperparameter is generally the learning rate (LR). Our current best estimate of optimal LR for a model $m$ is the following:
$$ LR(m) = lr_{base} · M_{LoRA} · \Big(\frac{2000}{H_m}\Big)^{P_m} $$
where $lr_{base}$ is a constant base lr, $M_{LoRA}$ is a multiplier applied when using LoRA (1 if using full-finetuning), $H_m$ is the hidden size of the model $m$, and $P_m$ is a model-specific exponent adjustment. Importantly, note that this function is independent of the LoRA rank.
Our current best estimates are the following: $lr_{base} = 5e-5$,
$M_{LoRA} = 10$, $P_m = 0.0775$ for Qwen models and $P_m = 0.781$ for Llama models.
You can use the following function to get the recommended lr for any model:
```
from tinker_cookbook.hyperparam_utils import get_lr
model_name = "meta-llama/Llama-3.2-1B"
print(get_lr(model_name))
```
We can define the regret of using any lr as the following:
$$regret(lr') = \frac{loss(lr') - min_{lr} loss(lr)}{min_{lr} loss(lr)}$$
We used our lr estimate to run SFT experiments across a variety of datasets, dataset sizes, batch_sizes and lora_ranks, and found the regret to be \<0.5%.
## Batch size
The second most important hyperparameter is the batch size. For small batch sizes, there's a phenomenon of *perfect scaling*, where the LR and batchsize should be varied together as $LR \propto \sqrt{B}$, and the learning curve only depends on $\frac{LR}{\sqrt{B}}$. See e.g., [Shallue et al. (2018)](https://arxiv.org/abs/1811.03600), in the training-from-scratch setting. When fine-tuning LLMs, we're often in a regime where smaller batch sizes give better performance, at the cost of longer training time; moreover, the $LR \propto \sqrt{B}$ scaling doesn't always hold. When doing SL fine-tuning, we recommend using smaller batch sizes like 128, depending on your tolerance for longer training time. For best results, you should aim for at least 100 steps of training (but usually get best results with 1000 or more). [NOTE: we are not confident about these recommendations and need to do further research.]
---
## File: supervised-learning/sl-basic.mdx
import { CookbookLink } from '../../components/CookbookLink'
# Basic Supervised Learning
We've provided an implementation of supervised learning in <CookbookLink path="tinker_cookbook/supervised/train.py">train_cli.py</CookbookLink>. To use this training loop, you'll need to create a `Config` object with the data and parameters.
We've provided an example of a script that starts up an SL training run in <CookbookLink path="tinker_cookbook/recipes/sl_basic.py">sl_basic.py</CookbookLink>. You can run it from the command line as follows:
```bash
python -m tinker_cookbook.recipes.sl_basic
```
This script will fine-tune the Llama-3.1-8B base (pretrained) model on a small dataset called [NoRobots](https://huggingface.co/datasets/HuggingFaceH4/no_robots), created by Hugging Face.
- Each step you should see a printout of the train and test loss, along with other stats like timing.
- The training script will also print out what the data looks like, with predicted tokens (weight=1) in green and context tokens (weight=0) in yellow.
- The training script will write various logs and checkpoint info to the `log_path` directory, which is set to `/tmp/tinker-examples/sl_basic` in the example script.
Looking at this directory, there are several files of interest:
- `metrics.jsonl`: the training metrics that also were printed to the console. You can load and plot them like this:
```python
import pandas
import matplotlib.pyplot as plt
df = pandas.read_json("/tmp/tinker-examples/sl_basic/metrics.jsonl", lines=True)
plt.plot(df['train_mean_nll'], label='train_loss')
plt.plot(df['test/nll'].dropna(), label='test_loss')
plt.legend()
plt.show()
```
You should see a plot like this:

- `checkpoints.jsonl`: this files stores the checkpoints that were saved during training. Recall from [Saving and Loading](/save-load) that there are (currently) two kinds of checkpoints: one that has "/sampler_weights/" in the path (used for sampling), and the other that has "/weights/" in the path (includes full optimizer state, used for resuming training). If you interrupt the training script, then run it again, it will ask you if you want to resume training. If you choose to do so, it'll load the last (full state) checkpoint from this file.
- `config.json`: the configuration that you used for training.
In the `sl_basic` script, you'll see that there's also some disabled code (under `if 0:`) that shows how to use your own dataset, specified as a JSONL file, provided in the format of <CookbookLink path="example-data/conversations.jsonl">conversations.jsonl</CookbookLink>.
---
## File: supervised-learning/prompt-distillation.mdx
import { CookbookLink } from '../../components/CookbookLink'
# Prompt Distillation
Prompt distillation is a training technique in which a model is optimized to behave as though it had been provided with a long and complex prompt, without requiring access to that prompt during inference.
At a high level, this procedure involves two main steps:
- **Creation of distillation data**: A teacher prompt, which is typically lengthy and highly detailed, provides explicit, step-by-step instructions. A teacher model uses this prompt to generate responses for a set of queries.
- **Training the student model**: A student model is then trained (or fine-tuned) on the distilled dataset, thereby learning to reproduce the essential behaviors and reasoning encoded in the teacher’s instructions.
---
## Overview
Let $f_T$ and $f_S$ denote the teacher and student models, respectively. Given an instruction prompt $P$ and a query $q_i$, the teacher model generates a response $r_i$:
$$
r_i = f_T([P, q_i])
$$
Here, the prompt $P$ and the query $q_i$ are concatenated to form the input to the teacher model $f_T$. For a dataset of queries $Q = \{q_i \mid 1 \leq i \leq D\}$, we obtain a corresponding set of teacher responses $R = \{r_i \mid 1 \leq i \leq D\}$.
The distillation training dataset is defined as the set of query–response pairs (excluding the original prompt):
$$
T = \{(q_i, r_i) \mid 1 \leq i \leq D\}.
$$
The student model $f_S$ is then trained to minimize the cross-entropy loss:
$$
\ell(f_S(q_i), r_i) = \ell(f_S(q_i), f_T([P, q_i])).
$$
---
## Example
The Tinker Cookbook provides a prompt distillation recipe tailored for a language classification task. The objective is straightforward: given a text query, the model should predict a two-character code corresponding to the language of the input. The set of possible labels is:
```
ar (Arabic), de (German), el (Greek), en (English), es (Spanish), fr (French), hi (Hindi), ru (Russian), tr (Turkish), ur (Urdu), vi (Vietnamese), zh (Chinese - Simplified), ot (Other/Unknown).
```
The recipe in <CookbookLink path="tinker_cookbook/recipes/prompt_distillation/create_data.py">recipes/prompt_distillation/create_data.py</CookbookLink> also includes handling strategies for inputs containing code, numerical content, or multiple languages.
In the example below, the same model (`Qwen/Qwen3-30B-A3B`) is used as both teacher and student, though in general they need not be identical.
---
### Step 1: Generate Training Data
Create prompt distillation data using the teacher model using <CookbookLink path="tinker_cookbook/recipes/prompt_distillation/create_data.py">recipes/prompt_distillation/create_data.py</CookbookLink>:
```bash
python -m tinker_cookbook.recipes.prompt_distillation.create_data \
output_file=/tmp/tinker-datasets/prompt_distillation_lang.jsonl
```
This command will:
- Use the configured teacher model to generate language classification examples
- Save the distilled dataset to the specified output file
- Create diverse training examples suitable for student model fine-tuning
### Step 2: Train the Student Model
Fine-tune a student model on the distillation data using <CookbookLink path="tinker_cookbook/recipes/prompt_distillation/train.py">recipes/prompt_distillation/train.py</CookbookLink>:
```bash
python -m tinker_cookbook.recipes.prompt_distillation.train
```
The training script will:
- Load the generated distillation dataset
- Apply optimized training configurations
- Fine-tune the student model for language classification
### Step 3: Test Your Model
Once training is complete, you can test your distilled model by sampling from the trained model to verify its performance on language classification tasks.
## Advanced Configuration
The prompt distillation recipe can be customized for different scenarios:
- **Teacher model selection**: Choose different base models based on your requirements
- **Sampling strategies**: Adjust temperature and other generation parameters
- **Data volume**: Scale the number of generated examples based on your needs
- **Training hyperparameters**: Fine-tune learning rates and other training settings
---
## File: supervised-learning/sweep-case-study.mdx
import { CookbookLink } from '../../components/CookbookLink'
# Sweep case study
In [Supervised Learning Hyperparameters](./sl-hyperparams), we introduced default hyperparameters as a starting point. While defaults are useful, optimal values are often task-specific. A hyperparameter sweep (systematically testing values across a range) is a more reliable way to identify the best hyperparameter values.
This page demonstrates how to sweep over the **learning rate (LR)** to find an optimal value.
## Setup
We use the simple supervised learning training loop in
<CookbookLink path="tinker_cookbook/recipes/sl_loop.py">sl_loop.py</CookbookLink>, which trains a Llama-3.1-8B model.
To retrieve the model’s default learning rate recommendation:
```
from tinker_cookbook.hyperparam_utils import get_lr
print(get_lr("meta-llama/Llama-3.1-8B"))
```
This should output
```
0.0002856415043086949 # ≈ 2.8e-4
```
This default value provides a baseline. A common best practice is to sweep one order of magnitude above and below the default. For this case, we sweep over: $lr \in [1e-5, 3e-5, 1e-4, 3e-4, 1e-3, 3e-3]$
## Running the sweep
Launch experiments in separate terminal windows for each LR value. For example:
```bash
python -m tinker_cookbook.recipes.sl_loop learning_rate=0.003 log_path=/tmp/sft-lr-sweep/lr-0.003
python -m tinker_cookbook.recipes.sl_loop learning_rate=0.001 log_path=/tmp/sft-lr-sweep/lr-0.001
python -m tinker_cookbook.recipes.sl_loop learning_rate=0.0003 log_path=/tmp/sft-lr-sweep/lr-0.0003
python -m tinker_cookbook.recipes.sl_loop learning_rate=0.0001 log_path=/tmp/sft-lr-sweep/lr-0.0001
python -m tinker_cookbook.recipes.sl_loop learning_rate=0.0003 log_path=/tmp/sft-lr-sweep/lr-0.00003
python -m tinker_cookbook.recipes.sl_loop learning_rate=0.0001 log_path=/tmp/sft-lr-sweep/lr-0.00001
```
You can also automate this process by writing a script that spawns multiple tmux windows and launches experiments programmatically.
## Collecting Results
After the experiments are complete, you can read the `metrics.jsonl` file for all the experiments:
```python
from glob import glob
import pandas
import os
import json
data = []
for fname in sorted(glob(os.path.expanduser("/tmp/sft-lr-sweep/*/metrics.jsonl"))):
df = pandas.read_json(fname, lines=True)
# make sure the experiment is completed
if len(df) == 0 or df["progress"].iloc[-1] < 0.98:
continue
config_fname = fname.replace("metrics.jsonl", "config.json")
with open(config_fname, "rb") as f:
metadata = json.load(f)
data.append({
"fname": fname,
"learning_rate": metadata["learning_rate"],
"final_loss": df["train_mean_nll"].iloc[-1].item()
})
print(f"Read metrics for {len(data)} experiments")
```
If all the experiments are completed, the above code should print:
```
Read metrics for 6 experiments
```
## Visualizing the Sweep
Plot the `final_loss` as a function of `learning_rate`:
```python
import matplotlib.pyplot as plt
df = pandas.DataFrame(data)
plt.plot(df["learning_rate"], df["final_loss"], marker='o')
plt.axhline(y=df["final_loss"].min(), color="green", linestyle="--")
plt.ylim(1.65, 1.8)
plt.xscale("log")
plt.xlabel("Learning Rate (log scale)")
plt.ylabel("Final Loss")
plt.title("Final Loss vs Learning Rate")
plt.show()
```
You should see a U-shaped curve, similar to this:

If the full U-curve is not visible in your setting, expand the sweep range by adding more LR values.
## Determining the Optimal LR
The optimal learning rate is the one that minimizes the loss. The plot above shows that the optimal LR is `3e-4` which you can also calculate by finding the minima:
```
optimal_lr = df["learning_rate"][df["final_loss"].idxmin()]
print(f"The optimal LR is {optimal_lr:.2e}")
```
Expected output:
```
The optimal LR is 3.00e-04
```
Note that the optimal LR in our sweep (`3e-4`) is very close to the default LR (`2.8e-4`).
---
## File: supervised-learning/sl-loop.mdx
import { CookbookLink } from '../../components/CookbookLink'
# Supervised Learning Training Loop
We've provided a simple SL training loop in <CookbookLink path="tinker_cookbook/recipes/sl_loop.py">sl_loop.py</CookbookLink>, which avoids using our dataset classes and instead defines the data loading a more self-contained way. This is for people who like to write their own training loops or learn about how things work under the hood. Our more performant implementation in <CookbookLink path="tinker_cookbook/supervised/train.py">supervised/train.py</CookbookLink> does basically the same thing, but with some performance optimizations, and with some additional features like periodic evals.
---
# PART 2: TYPE DEFINITIONS
Total types collected: 30
## Type: AdamParams
```python
class AdamParams(StrictBase):
learning_rate: float = 0.0001
"""Learning rate for the optimizer"""
beta1: float = 0.9
"""Coefficient used for computing running averages of gradient"""
beta2: float = 0.95
"""Coefficient used for computing running averages of gradient square"""
eps: float = 1e-12
"""Term added to the denominator to improve numerical stability"""
```
## Type: ComputeLogprobsResponse
```python
class ComputeLogprobsResponse(BaseModel):
logprobs: Sequence[Optional[float]]
type: Literal["compute_logprobs"] = "compute_logprobs"
```
## Type: CreateModelResponse
```python
class CreateModelResponse(BaseModel):
model_id: ModelID
type: Literal["create_model"] = "create_model"
```
## Type: Datum
```python
class Datum(StrictBase):
loss_fn_inputs: LossFnInputs
"""Dictionary mapping field names to tensor data"""
model_input: ModelInput
@model_validator(mode="before")
@classmethod
def convert_tensors(cls, data: Any) -> Any:
"""Convert torch.Tensor and numpy arrays to TensorData in loss_fn_inputs during construction."""
if isinstance(data, dict) and "loss_fn_inputs" in data:
loss_fn_inputs = data["loss_fn_inputs"]
if isinstance(loss_fn_inputs, dict):
converted_inputs = {}
for key, value in loss_fn_inputs.items():
converted_inputs[key] = cls._maybe_convert_array(key, value)
data = dict(data) # Make a copy
data["loss_fn_inputs"] = converted_inputs
return data
@classmethod
def _maybe_convert_array(cls, key: str, value: Any) -> Any:
"""Convert torch.Tensor, numpy array, or 1-D list to TensorData if needed."""
if _HAVE_TORCH and isinstance(value, torch.Tensor):
return TensorData.from_torch(value)
elif isinstance(value, np.ndarray):
return TensorData.from_numpy(value)
elif isinstance(value, list):
# assume it's 1d and infer the dtype from the key
return TensorData(data=value, dtype=_key_to_type[key], shape=[len(value)])
else:
return value
_key_to_type = {
"target_tokens": "int64",
"weights": "float32",
"advantages": "float32",
"logprobs": "float32",
}
```
## Type: EncodedTextChunk
```python
class EncodedTextChunk(StrictBase):
tokens: Sequence[int]
"""Array of token IDs"""
type: Literal["encoded_text"] = "encoded_text"
@property
def length(self) -> int:
return len(self.tokens)
```
## Type: ForwardBackwardInput
```python
class ForwardBackwardInput(StrictBase):
data: List[Datum]
"""Array of input data for the forward/backward pass"""
loss_fn: LossFnType
"""Fully qualified function path for the loss function"""
```
## Type: ForwardBackwardOutput
```python
class ForwardBackwardOutput(BaseModel):
loss_fn_output_type: str
"""The type of the ForwardBackward output. Can be one of [...] TODO"""
loss_fn_outputs: List[LossFnOutput]
"""Dictionary mapping field names to tensor data"""
metrics: Dict[str, float]
"""Training metrics as key-value pairs"""
```
## Type: GetInfoResponse
```python
class GetInfoResponse(BaseModel):
type: Optional[Literal["get_info"]] = None
model_data: ModelData
model_id: ModelID
is_lora: Optional[bool] = None
lora_rank: Optional[int] = None
model_name: Optional[str] = None
if PYDANTIC_V2:
# allow fields with a `model_` prefix
model_config = ConfigDict(protected_namespaces=tuple())
```
## Type: GetServerCapabilitiesResponse
```python
class GetServerCapabilitiesResponse(BaseModel):
supported_models: List[SupportedModel]
```
## Type: ImageAssetPointerChunk
```python
class ImageAssetPointerChunk(StrictBase):
format: Literal["png", "jpeg"]
"""Image format"""
height: int
"""Image height in pixels"""
location: str
"""Path or URL to the image asset"""
tokens: int
"""Number of tokens this image represents"""
width: int
"""Image width in pixels"""
type: Literal["image_asset_pointer"] = "image_asset_pointer"
@property
def length(self) -> int:
return self.tokens
```
## Type: LoadWeightsResponse
```python
class LoadWeightsResponse(BaseModel):
path: Optional[str] = None
"""A tinker URI for model weights at a specific step"""
type: Optional[Literal["load_weights"]] = None
```
## Type: LoraConfig
```python
class LoraConfig(StrictBase):
rank: int
"""LoRA rank (dimension of low-rank matrices)"""
seed: Optional[int] = None
"""Seed used for initialization of LoRA weights.
Useful if you need deterministic or reproducible initialization of weights.
"""
train_unembed: bool = True
"""Whether to add lora to the unembedding layer"""
train_mlp: bool = True
"""Whether to add loras to the MLP layers (including MoE layers)"""
train_attn: bool = True
"""Whether to add loras to the attention layers"""
```
## Type: LossFnInputs
```python
LossFnInputs: TypeAlias = Dict[str, TensorData]
```
## Type: LossFnOutput
```python
LossFnOutput: TypeAlias = Dict[str, TensorData]
```
## Type: LossFnType
```python
LossFnType: TypeAlias = Literal["cross_entropy", "importance_sampling", "ppo"]
```
## Type: ModelData
```python
class ModelData(BaseModel):
arch: Optional[str] = None
model_name: Optional[str] = None
```
## Type: ModelID
```python
ModelID: TypeAlias = str
```
## Type: ModelInput
```python
class ModelInput(StrictBase):
chunks: List[ModelInputChunk]
"""Sequence of input chunks (formerly TokenSequence)"""
@classmethod
def from_ints(cls, tokens: List[int]) -> "ModelInput":
"""
Create a ModelInput from a list of ints (tokens).
"""
return cls(chunks=[EncodedTextChunk(tokens=tokens)])
def to_ints(self) -> List[int]:
"""
Convert the ModelInput to a list of ints (tokens)
Throws exception if there are any non-token chunks
"""
if not all(isinstance(chunk, EncodedTextChunk) for chunk in self.chunks):
raise ValueError(f"to_ints only supported for ModelInput with EncodedTextChunks, got {[type(chunk) for chunk in self.chunks]}")
return [token for chunk in self.chunks for token in chunk.tokens]
@property
def length(self) -> int:
"""
Return the total context length used by this ModelInput.
"""
return sum(chunk.length for chunk in self.chunks)
@classmethod
def empty(cls) -> "ModelInput":
"""
Create an empty ModelInput.
"""
return cls(chunks=[])
def append(self, chunk: ModelInputChunk) -> "ModelInput":
"""
Add a new chunk, return a new ModelInput.
"""
return ModelInput(chunks=self.chunks + [chunk])
def append_int(self, token: int) -> "ModelInput":
"""
Add a new token, return a new ModelInput.
"""
return self.append(EncodedTextChunk(tokens=[token]))
```
## Type: ModelInputChunk
```python
ModelInputChunk: TypeAlias = Annotated[
Union[EncodedTextChunk, ImageAssetPointerChunk], PropertyInfo(discriminator="type")
]
```
## Type: OptimStepResponse
```python
class OptimStepResponse(BaseModel):
metrics: Optional[Dict[str, float]] = None
"""Optimization step metrics as key-value pairs"""
```
## Type: SampleResponse
```python
class SampleResponse(BaseModel):
sequences: Sequence[SampledSequence]
type: Literal["sample"] = "sample"
prompt_logprobs: Optional[List[Optional[float]]] = None
"""
If prompt_logprobs was set to true in the request, logprobs are computed for
every token in the prompt. The `prompt_logprobs` response contains a float32
value for every token in the prompt.
"""
```
## Type: SampledSequence
```python
class SampledSequence(BaseModel):
stop_reason: StopReason
"""Reason why sampling stopped"""
tokens: List[int]
"""List of generated token IDs"""
logprobs: Optional[List[float]] = None
"""Log probabilities for each token (optional)"""
```
## Type: SamplingParams
```python
class SamplingParams(BaseModel):
max_tokens: Optional[int] = None
"""Maximum number of tokens to generate"""
seed: Optional[int] = None
"""Random seed for reproducible generation"""
stop: Union[str, Sequence[str], Sequence[int], None] = None
"""Stop sequences for generation"""
temperature: float = 1
"""Sampling temperature"""
top_k: int = -1
"""Top-k sampling parameter (-1 for no limit)"""
top_p: float = 1
"""Nucleus sampling probability"""
```
## Type: SaveWeightsForSamplerResponse
```python
class SaveWeightsForSamplerResponse(BaseModel):
path: str
"""A tinker URI for model weights for sampling at a specific step"""
type: Optional[Literal["save_weights_for_sampler"]] = None
```
## Type: SaveWeightsResponse
```python
class SaveWeightsResponse(BaseModel):
path: str
"""A tinker URI for model weights at a specific step"""
type: Optional[Literal["save_weights"]] = None
```
## Type: StopReason
```python
StopReason: TypeAlias = Literal["length", "stop"]
```
## Type: SupportedModel
```python
class SupportedModel(BaseModel):
model_name: Optional[str] = None
```
## Type: TensorData
```python
class TensorData(StrictBase):
data: Union[List[int], List[float]]
"""Flattened tensor data as array of numbers."""
dtype: TensorDtype
shape: Optional[List[int]] = None
"""Optional.
The shape of the tensor (see PyTorch tensor.shape). The shape of a
one-dimensional list of length N is `(N,)`. Can usually be inferred if not
provided, and is generally inferred as a 1D tensor.
"""
@classmethod
def from_numpy(cls, array: npt.NDArray[Any]) -> "TensorData":
return cls(
data=array.flatten().tolist(),
dtype=_convert_numpy_dtype_to_tensor(array.dtype),
shape=list(array.shape),
)
@classmethod
def from_torch(cls, tensor: "torch.Tensor") -> "TensorData":
return cls(
data=tensor.flatten().tolist(),
dtype=_convert_torch_dtype_to_tensor(tensor.dtype),
shape=list(tensor.shape),
)
def to_numpy(self) -> npt.NDArray[Any]:
"""Convert TensorData to numpy array."""
numpy_dtype = _convert_tensor_dtype_to_numpy(self.dtype)
arr = np.array(self.data, dtype=numpy_dtype)
if self.shape is not None:
arr = arr.reshape(self.shape)
return arr
def to_torch(self) -> "torch.Tensor":
"""Convert TensorData to torch tensor."""
if not _HAVE_TORCH:
raise ImportError("PyTorch is not installed. Cannot convert to torch tensor.")
torch_dtype = _convert_tensor_dtype_to_torch(self.dtype)
tensor = torch.tensor(self.data, dtype=torch_dtype)
if self.shape is not None:
tensor = tensor.reshape(self.shape)
return tensor
def tolist(self) -> List[Any]:
return self.to_numpy().tolist()
def _convert_tensor_dtype_to_numpy(dtype: TensorDtype) -> npt.DTypeLike:
"""Convert TensorDtype to numpy dtype-like."""
if dtype == "float32":
return np.float32
elif dtype == "int64":
return np.int64
else:
raise ValueError(f"Unsupported TensorDtype: {dtype}")
def _convert_tensor_dtype_to_torch(dtype: TensorDtype) -> "torch.dtype":
"""Convert TensorDtype to torch dtype."""
if not _HAVE_TORCH:
raise ImportError("PyTorch is not installed. Cannot convert to torch dtype.")
import torch
if dtype == "float32":
return torch.float32
elif dtype == "int64":
return torch.int64
else:
raise ValueError(f"Unsupported TensorDtype: {dtype}")
def _convert_numpy_dtype_to_tensor(dtype: np.dtype[Any]) -> TensorDtype:
"""Convert numpy dtype to TensorDtype."""
if dtype.kind == "f":
return "float32"
elif dtype.kind == "i":
return "int64"
else:
raise ValueError(f"Unsupported numpy dtype: {dtype}")
def _convert_torch_dtype_to_tensor(dtype: "torch.dtype") -> TensorDtype:
"""Convert torch dtype to TensorDtype."""
# torch.dtype objects have .is_floating_point
if getattr(dtype, "is_floating_point", False):
return "float32"
else:
return "int64"
```
## Type: TensorDtype
```python
TensorDtype: TypeAlias = Literal["int64", "float32"]
```
## Type: UnloadModelResponse
```python
class UnloadModelResponse(BaseModel):
model_id: ModelID
type: Optional[Literal["unload_model"]] = None
```
|