Spaces:
Running
Running
Commit ·
20661cf
1
Parent(s): 2a3e6df
feat: complete visual curriculum for CME 295 with SVG diagrams for all lectures
Browse files- CME295-Transformers/index.html +256 -12
CME295-Transformers/index.html
CHANGED
|
@@ -518,10 +518,33 @@
|
|
| 518 |
</div>
|
| 519 |
`,
|
| 520 |
concepts: `
|
| 521 |
-
<h3>Fundamental NLP Components</h3>
|
| 522 |
<div class="list-item">
|
| 523 |
<div class="list-num">01</div>
|
| 524 |
-
<div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 525 |
</div>
|
| 526 |
<div class="list-item">
|
| 527 |
<div class="list-num">02</div>
|
|
@@ -587,11 +610,63 @@
|
|
| 587 |
</div>
|
| 588 |
<div class="list-item">
|
| 589 |
<div class="list-num">03</div>
|
| 590 |
-
<div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 591 |
</div>
|
| 592 |
<div class="list-item">
|
| 593 |
<div class="list-num">04</div>
|
| 594 |
-
<div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 595 |
</div>
|
| 596 |
`,
|
| 597 |
math: `
|
|
@@ -653,11 +728,55 @@
|
|
| 653 |
</div>
|
| 654 |
<div class="list-item">
|
| 655 |
<div class="list-num">02</div>
|
| 656 |
-
<div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 657 |
</div>
|
| 658 |
<div class="list-item">
|
| 659 |
<div class="list-num">03</div>
|
| 660 |
-
<div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 661 |
</div>
|
| 662 |
<div class="list-item">
|
| 663 |
<div class="list-num">04</div>
|
|
@@ -673,11 +792,44 @@
|
|
| 673 |
<h3>LLM Internals & Inference</h3>
|
| 674 |
<div class="list-item">
|
| 675 |
<div class="list-num">01</div>
|
| 676 |
-
<div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 677 |
</div>
|
| 678 |
<div class="list-item">
|
| 679 |
<div class="list-num">02</div>
|
| 680 |
-
<div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 681 |
</div>
|
| 682 |
<div class="list-item">
|
| 683 |
<div class="list-num">03</div>
|
|
@@ -715,7 +867,26 @@
|
|
| 715 |
<h3>Pretraining & Scaling Deep Dive</h3>
|
| 716 |
<div class="list-item">
|
| 717 |
<div class="list-num">01</div>
|
| 718 |
-
<div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 719 |
</div>
|
| 720 |
<div class="list-item">
|
| 721 |
<div class="list-num">02</div>
|
|
@@ -747,7 +918,30 @@
|
|
| 747 |
</div>
|
| 748 |
<div class="list-item">
|
| 749 |
<div class="list-num">03</div>
|
| 750 |
-
<div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 751 |
</div>
|
| 752 |
<div class="list-item">
|
| 753 |
<div class="list-num">04</div>
|
|
@@ -767,7 +961,29 @@
|
|
| 767 |
<h3>Logic, Math & Verification</h3>
|
| 768 |
<div class="list-item">
|
| 769 |
<div class="list-num">01</div>
|
| 770 |
-
<div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 771 |
</div>
|
| 772 |
<div class="list-item">
|
| 773 |
<div class="list-num">02</div>
|
|
@@ -827,7 +1043,35 @@
|
|
| 827 |
<h3>The Multi-Senses Architecture</h3>
|
| 828 |
<div class="list-item">
|
| 829 |
<div class="list-num">01</div>
|
| 830 |
-
<div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 831 |
</div>
|
| 832 |
<div class="list-item">
|
| 833 |
<div class="list-num">02</div>
|
|
|
|
| 518 |
</div>
|
| 519 |
`,
|
| 520 |
concepts: `
|
|
|
|
| 521 |
<div class="list-item">
|
| 522 |
<div class="list-num">01</div>
|
| 523 |
+
<div>
|
| 524 |
+
<strong>Tokenization & BPE:</strong> Transition from word-level to sub-word level. Byte Pair Encoding (BPE) uses a frequency-based merge strategy to build a vocabulary.
|
| 525 |
+
|
| 526 |
+
<div class="visual-demo" style="margin-top: 15px; background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; border: 1px solid var(--green);">
|
| 527 |
+
<h4 style="color: var(--green); margin-bottom: 10px; font-size: 0.85em;">BPE Merging Strategy</h4>
|
| 528 |
+
<svg viewBox="0 0 400 120" style="width: 100%; height: auto;">
|
| 529 |
+
<!-- Initial state -->
|
| 530 |
+
<rect x="20" y="20" width="40" height="30" rx="4" fill="var(--surface)" stroke="var(--text-dim)" />
|
| 531 |
+
<text x="32" y="40" fill="var(--text)" font-size="12">l o</text>
|
| 532 |
+
<rect x="70" y="20" width="40" height="30" rx="4" fill="var(--surface)" stroke="var(--text-dim)" />
|
| 533 |
+
<text x="82" y="40" fill="var(--text)" font-size="12">w</text>
|
| 534 |
+
<rect x="120" y="20" width="40" height="30" rx="4" fill="var(--surface)" stroke="var(--text-dim)" />
|
| 535 |
+
<text x="132" y="40" fill="var(--text)" font-size="12">e r</text>
|
| 536 |
+
|
| 537 |
+
<!-- Merge Step -->
|
| 538 |
+
<path d="M 40 55 L 75 80" stroke="var(--green)" stroke-width="1" marker-end="url(#arrow-green)" />
|
| 539 |
+
<path d="M 90 55 L 85 80" stroke="var(--green)" stroke-width="1" marker-end="url(#arrow-green)" />
|
| 540 |
+
|
| 541 |
+
<rect x="60" y="85" width="60" height="30" rx="4" fill="rgba(46, 204, 113, 0.2)" stroke="var(--green)" />
|
| 542 |
+
<text x="75" y="105" fill="var(--text)" font-size="12">"low"</text>
|
| 543 |
+
<text x="130" y="105" fill="var(--text-dim)" font-size="10">+ "er" ...</text>
|
| 544 |
+
</svg>
|
| 545 |
+
<p style="font-size: 0.75em; color: var(--text-dim);">The most frequent adjacent pairs are merged step by step (l+o → lo, then lo+w → low) into single tokens, shortening sequences while rare words still decompose into known sub-words.</p>
|
| 546 |
+
</div>
|
| 547 |
+
</div>
|
| 548 |
</div>
|
| 549 |
<div class="list-item">
|
| 550 |
<div class="list-num">02</div>
|
|
|
|
| 610 |
</div>
|
| 611 |
<div class="list-item">
|
| 612 |
<div class="list-num">03</div>
|
| 613 |
+
<div>
|
| 614 |
+
<strong>The RNN Vanishing Gradient:</strong> Recurrent architectures process tokens sequentially, so gradients and information from early tokens fade over long sequences. Transformers avoid this by letting every token attend directly to every other token in parallel.
|
| 615 |
+
|
| 616 |
+
<div class="visual-demo" style="margin-top: 15px; background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; border: 1px solid #e74c3c;">
|
| 617 |
+
<h4 style="color: #e74c3c; margin-bottom: 10px; font-size: 0.85em;">Sequential (RNN) vs. Parallel (Transformer)</h4>
|
| 618 |
+
<svg viewBox="0 0 400 150" style="width: 100%; height: auto;">
|
| 619 |
+
<!-- RNN Part -->
|
| 620 |
+
<text x="20" y="25" fill="#e74c3c" font-size="10" font-weight="bold">RNN (Bottleneck)</text>
|
| 621 |
+
<rect x="20" y="40" width="30" height="20" rx="2" fill="var(--surface)" stroke="var(--text-dim)" />
|
| 622 |
+
<path d="M 50 50 L 70 50" stroke="var(--text-dim)" marker-end="url(#arrow-red)" />
|
| 623 |
+
<rect x="70" y="40" width="30" height="20" rx="2" fill="var(--surface)" stroke="var(--text-dim)" />
|
| 624 |
+
<path d="M 100 50 L 120 50" stroke="#e74c3c" stroke-width="2" marker-end="url(#arrow-red)" />
|
| 625 |
+
<text x="125" y="55" fill="#e74c3c" font-size="20">...</text>
|
| 626 |
+
|
| 627 |
+
<!-- Transformer Part -->
|
| 628 |
+
<text x="220" y="25" fill="var(--cyan)" font-size="10" font-weight="bold">Transformer (All-at-once)</text>
|
| 629 |
+
<rect x="220" y="40" width="20" height="15" rx="1" fill="var(--cyan)" opacity="0.3" />
|
| 630 |
+
<rect x="250" y="40" width="20" height="15" rx="1" fill="var(--cyan)" opacity="0.3" />
|
| 631 |
+
<rect x="280" y="40" width="20" height="15" rx="1" fill="var(--cyan)" opacity="0.3" />
|
| 632 |
+
<path d="M 230 55 L 290 85" stroke="var(--cyan)" opacity="0.4" />
|
| 633 |
+
<path d="M 260 55 L 290 85" stroke="var(--cyan)" opacity="0.4" />
|
| 634 |
+
<path d="M 290 55 L 290 85" stroke="var(--cyan)" opacity="0.8" />
|
| 635 |
+
<rect x="220" y="85" width="100" height="40" rx="4" fill="var(--surface)" stroke="var(--cyan)" />
|
| 636 |
+
<text x="235" y="110" fill="var(--cyan)" font-size="12">Attention Layer</text>
|
| 637 |
+
</svg>
|
| 638 |
+
<p style="font-size: 0.75em; color: var(--text-dim);">Transformers eliminate the time-step dependency, removing the "forgetting" bottleneck of RNNs.</p>
|
| 639 |
+
</div>
|
| 640 |
+
</div>
|
| 641 |
</div>
|
| 642 |
<div class="list-item">
|
| 643 |
<div class="list-num">04</div>
|
| 644 |
+
<div>
|
| 645 |
+
<strong>Self-Attention Mechanism:</strong> The dynamic computation of weights via Query (Q), Key (K), and Value (V) projections.
|
| 646 |
+
|
| 647 |
+
<div class="visual-demo" style="margin-top: 15px; background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; border: 1px solid var(--yellow);">
|
| 648 |
+
<h4 style="color: var(--yellow); margin-bottom: 10px; font-size: 0.85em;">Query, Key, and Value Interaction</h4>
|
| 649 |
+
<svg viewBox="0 0 400 150" style="width: 100%; height: auto;">
|
| 650 |
+
<!-- Q, K, V nodes -->
|
| 651 |
+
<circle cx="50" cy="40" r="15" fill="var(--cyan)" /> <text x="45" y="45" fill="var(--bg)" font-weight="bold">Q</text>
|
| 652 |
+
<circle cx="150" cy="40" r="15" fill="var(--orange)" /> <text x="145" y="45" fill="var(--bg)" font-weight="bold">K</text>
|
| 653 |
+
<circle cx="250" cy="40" r="15" fill="var(--green)" /> <text x="245" y="45" fill="var(--bg)" font-weight="bold">V</text>
|
| 654 |
+
|
| 655 |
+
<!-- Dot product -->
|
| 656 |
+
<path d="M 65 40 L 135 40" stroke="var(--yellow)" stroke-width="2" marker-end="url(#arrow-yellow)" />
|
| 657 |
+
<text x="75" y="30" fill="var(--yellow)" font-size="10">Dot product (Score)</text>
|
| 658 |
+
|
| 659 |
+
<!-- Weighted Sum -->
|
| 660 |
+
<path d="M 165 40 L 235 40" stroke="var(--text-dim)" opacity="0.5" />
|
| 661 |
+
<path d="M 150 55 L 200 100" stroke="var(--yellow)" marker-end="url(#arrow-yellow)" />
|
| 662 |
+
<path d="M 250 55 L 210 100" stroke="var(--green)" marker-end="url(#arrow-green)" />
|
| 663 |
+
|
| 664 |
+
<rect x="170" y="105" width="60" height="30" rx="4" fill="var(--surface)" stroke="var(--yellow)" />
|
| 665 |
+
<text x="182" y="125" fill="var(--text)" font-size="10">Context Z</text>
|
| 666 |
+
</svg>
|
| 667 |
+
<p style="font-size: 0.75em; color: var(--text-dim);">The Query looks for specific information in the Keys. The score determines how much of the Value is retrieved.</p>
|
| 668 |
+
</div>
|
| 669 |
+
</div>
|
| 670 |
</div>
|
| 671 |
`,
|
| 672 |
math: `
|
|
|
|
| 728 |
</div>
|
| 729 |
<div class="list-item">
|
| 730 |
<div class="list-num">02</div>
|
| 731 |
+
<div>
|
| 732 |
+
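<strong>KV Cache & GQA:</strong> Without caching, autoregressive generation recomputes the keys and values of every previous token at each step; the KV cache stores them for reuse. Grouped-Query Attention (GQA) shrinks that cache and the memory bandwidth needed to read it.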
|
| 733 |
+
|
| 734 |
+
<div class="visual-demo" style="margin-top: 15px; background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; border: 1px solid var(--cyan);">
|
| 735 |
+
<h4 style="color: var(--cyan); margin-bottom: 10px; font-size: 0.85em;">Head Grouping: MHA vs GQA</h4>
|
| 736 |
+
<svg viewBox="0 0 400 120" style="width: 100%; height: auto;">
|
| 737 |
+
<!-- MHA -->
|
| 738 |
+
<text x="20" y="20" fill="var(--text-dim)" font-size="10">MHA (1:1)</text>
|
| 739 |
+
<rect x="25" y="30" width="10" height="10" fill="var(--cyan)" /> <line x1="35" y1="35" x2="55" y2="35" stroke="var(--orange)" /> <rect x="55" y="30" width="10" height="10" fill="var(--orange)" />
|
| 740 |
+
<rect x="25" y="45" width="10" height="10" fill="var(--cyan)" /> <line x1="35" y1="50" x2="55" y2="50" stroke="var(--orange)" /> <rect x="55" y="45" width="10" height="10" fill="var(--orange)" />
|
| 741 |
+
|
| 742 |
+
<!-- GQA -->
|
| 743 |
+
<text x="220" y="20" fill="var(--cyan)" font-size="10">GQA (Many-to-One)</text>
|
| 744 |
+
<rect x="225" y="30" width="10" height="10" fill="var(--cyan)" />
|
| 745 |
+
<rect x="225" y="45" width="10" height="10" fill="var(--cyan)" />
|
| 746 |
+
<rect x="225" y="60" width="10" height="10" fill="var(--cyan)" />
|
| 747 |
+
<path d="M 235 35 L 265 50" stroke="var(--orange)" />
|
| 748 |
+
<path d="M 235 50 L 265 50" stroke="var(--orange)" />
|
| 749 |
+
<path d="M 235 65 L 265 50" stroke="var(--orange)" />
|
| 750 |
+
<rect x="265" y="45" width="10" height="10" fill="var(--orange)" />
|
| 751 |
+
</svg>
|
| 752 |
+
<p style="font-size: 0.75em; color: var(--text-dim);">GQA shares Key/Value heads across multiple Query heads, drastically reducing the KV Cache memory footprint.</p>
|
| 753 |
+
</div>
|
| 754 |
+
</div>
|
| 755 |
</div>
|
| 756 |
<div class="list-item">
|
| 757 |
<div class="list-num">03</div>
|
| 758 |
+
<div>
|
| 759 |
+
<strong>RoPE (Rotary Position Embeddings):</strong> Relative distance encoded as rotations in the complex plane.
|
| 760 |
+
|
| 761 |
+
<div class="visual-demo" style="margin-top: 15px; background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; border: 1px solid var(--orange);">
|
| 762 |
+
<h4 style="color: var(--orange); margin-bottom: 10px; font-size: 0.85em;">RoPE: Rotation in Latent Space</h4>
|
| 763 |
+
<svg viewBox="0 0 400 120" style="width: 100%; height: auto;">
|
| 764 |
+
<!-- Coordinate System -->
|
| 765 |
+
<circle cx="200" cy="60" r="40" fill="none" stroke="var(--text-dim)" opacity="0.3" stroke-dasharray="2" />
|
| 766 |
+
<line x1="160" y1="60" x2="240" y2="60" stroke="var(--text-dim)" opacity="0.5" />
|
| 767 |
+
<line x1="200" y1="20" x2="200" y2="100" stroke="var(--text-dim)" opacity="0.5" />
|
| 768 |
+
|
| 769 |
+
<!-- Vector 1 -->
|
| 770 |
+
<line x1="200" y1="60" x2="235" y2="40" stroke="var(--cyan)" stroke-width="2" marker-end="url(#arrow-cyan)" />
|
| 771 |
+
<text x="240" y="45" fill="var(--cyan)" font-size="10">Pos 1 (θ)</text>
|
| 772 |
+
|
| 773 |
+
<!-- Vector 2 -->
|
| 774 |
+
<line x1="200" y1="60" x2="180" y2="25" stroke="var(--orange)" stroke-width="2" marker-end="url(#arrow-orange)" />
|
| 775 |
+
<text x="160" y="25" fill="var(--orange)" font-size="10">Pos 2 (2θ)</text>
|
| 776 |
+
</svg>
|
| 777 |
+
<p style="font-size: 0.75em; color: var(--text-dim);">Position is encoded as a rotation angle. The dot product between rotated vectors naturally captures relative distance.</p>
|
| 778 |
+
</div>
|
| 779 |
+
</div>
|
| 780 |
</div>
|
| 781 |
<div class="list-item">
|
| 782 |
<div class="list-num">04</div>
|
|
|
|
| 792 |
<h3>LLM Internals & Inference</h3>
|
| 793 |
<div class="list-item">
|
| 794 |
<div class="list-num">01</div>
|
| 795 |
+
<div>
|
| 796 |
+
<strong>Mixture of Experts (MoE):</strong> Scaling model capacity without a proportional increase in per-token compute. For each token, a "Router" selects a small subset of "Experts."
|
| 797 |
+
|
| 798 |
+
<div class="visual-demo" style="margin-top: 15px; background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; border: 1px solid var(--green);">
|
| 799 |
+
<h4 style="color: var(--green); margin-bottom: 10px; font-size: 0.85em;">Sparse Expert Routing (MoE)</h4>
|
| 800 |
+
<svg viewBox="0 0 400 120" style="width: 100%; height: auto;">
|
| 801 |
+
<rect x="60" y="45" width="60" height="40" rx="4" fill="var(--surface)" stroke="var(--cyan)" />
|
| 802 |
+
<text x="70" y="70" fill="var(--cyan)" font-size="12">Router</text>
|
| 803 |
+
<line x1="120" y1="65" x2="200" y2="35" stroke="var(--green)" stroke-width="2" marker-end="url(#arrow-green)" />
|
| 804 |
+
<line x1="120" y1="65" x2="200" y2="95" stroke="var(--green)" stroke-width="2" marker-end="url(#arrow-green)" />
|
| 805 |
+
<rect x="200" y="20" width="80" height="30" rx="2" fill="rgba(46, 204, 113, 0.2)" stroke="var(--green)" />
|
| 806 |
+
<text x="215" y="40" fill="var(--text)" font-size="10">Expert 1</text>
|
| 807 |
+
<rect x="200" y="80" width="80" height="30" rx="2" fill="rgba(46, 204, 113, 0.2)" stroke="var(--green)" />
|
| 808 |
+
<text x="215" y="100" fill="var(--text)" font-size="10">Expert N</text>
|
| 809 |
+
</svg>
|
| 810 |
+
<p style="font-size: 0.75em; color: var(--text-dim);">The Router only activates the most relevant Feed-Forward networks for each specific token.</p>
|
| 811 |
+
</div>
|
| 812 |
+
</div>
|
| 813 |
</div>
|
| 814 |
<div class="list-item">
|
| 815 |
<div class="list-num">02</div>
|
| 816 |
+
<div>
|
| 817 |
+
<strong>KV Cache Management (vLLM):</strong> Efficient memory allocation.
|
| 818 |
+
|
| 819 |
+
<div class="visual-demo" style="margin-top: 15px; background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; border: 1px solid var(--orange);">
|
| 820 |
+
<h4 style="color: var(--orange); margin-bottom: 10px; font-size: 0.85em;">PagedAttention: Virtual Memory for LLMs</h4>
|
| 821 |
+
<svg viewBox="0 0 400 100" style="width: 100%; height: auto;">
|
| 822 |
+
<rect x="20" y="30" width="30" height="30" fill="var(--orange)" opacity="0.8" />
|
| 823 |
+
<rect x="55" y="30" width="30" height="30" fill="var(--orange)" opacity="0.4" />
|
| 824 |
+
<path d="M 35 65 L 150 80" stroke="var(--orange)" stroke-dasharray="2" />
|
| 825 |
+
<path d="M 70 65 L 220 50" stroke="var(--orange)" stroke-dasharray="2" />
|
| 826 |
+
<rect x="150" y="65" width="25" height="25" fill="var(--orange)" opacity="0.8" />
|
| 827 |
+
<rect x="180" y="35" width="25" height="25" fill="var(--surface)" stroke="var(--text-dim)" />
|
| 828 |
+
<rect x="210" y="35" width="25" height="25" fill="var(--orange)" opacity="0.4" />
|
| 829 |
+
</svg>
|
| 830 |
+
<p style="font-size: 0.75em; color: var(--text-dim);">Fixed-size, non-contiguous memory blocks are mapped to each sequence, eliminating external fragmentation and confining internal fragmentation to a sequence's last block.</p>
|
| 831 |
+
</div>
|
| 832 |
+
</div>
|
| 833 |
</div>
|
| 834 |
<div class="list-item">
|
| 835 |
<div class="list-num">03</div>
|
|
|
|
| 867 |
<h3>Pretraining & Scaling Deep Dive</h3>
|
| 868 |
<div class="list-item">
|
| 869 |
<div class="list-num">01</div>
|
| 870 |
+
<div>
|
| 871 |
+
<strong>Scaling Laws (Chinchilla):</strong> The relationship between Parameters (N) and Data (D).
|
| 872 |
+
|
| 873 |
+
<div class="visual-demo" style="margin-top: 15px; background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; border: 1px solid var(--cyan);">
|
| 874 |
+
<h4 style="color: var(--cyan); margin-bottom: 10px; font-size: 0.85em;">Chinchilla Compute-Optimal Frontier</h4>
|
| 875 |
+
<svg viewBox="0 0 400 150" style="width: 100%; height: auto;">
|
| 876 |
+
<!-- Axes -->
|
| 877 |
+
<line x1="40" y1="120" x2="360" y2="120" stroke="var(--text-dim)" />
|
| 878 |
+
<line x1="40" y1="20" x2="40" y2="120" stroke="var(--text-dim)" />
|
| 879 |
+
<text x="300" y="140" fill="var(--text-dim)" font-size="9">Tokens (D)</text>
|
| 880 |
+
<text x="10" y="20" fill="var(--text-dim)" font-size="9" transform="rotate(-90 40 20)">Loss</text>
|
| 881 |
+
|
| 882 |
+
<!-- Log Curves -->
|
| 883 |
+
<path d="M 50 30 Q 150 100 350 110" fill="none" stroke="var(--cyan)" stroke-width="2" />
|
| 884 |
+
<circle cx="200" cy="91" r="4" fill="var(--orange)" />
|
| 885 |
+
<text x="210" y="82" fill="var(--orange)" font-size="10">Optimal Balance</text>
|
| 886 |
+
</svg>
|
| 887 |
+
<p style="font-size: 0.75em; color: var(--text-dim);">For compute-optimal training, parameters and training tokens should be scaled in roughly equal proportion as the compute budget grows.</p>
|
| 888 |
+
</div>
|
| 889 |
+
</div>
|
| 890 |
</div>
|
| 891 |
<div class="list-item">
|
| 892 |
<div class="list-num">02</div>
|
|
|
|
| 918 |
</div>
|
| 919 |
<div class="list-item">
|
| 920 |
<div class="list-num">03</div>
|
| 921 |
+
<div>
|
| 922 |
+
<strong>Alignment (RLHF & DPO):</strong> Shaping behavior to match human values.
|
| 923 |
+
|
| 924 |
+
<div class="visual-demo" style="margin-top: 15px; background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; border: 1px solid var(--orange);">
|
| 925 |
+
<h4 style="color: var(--orange); margin-bottom: 10px; font-size: 0.85em;">The Alignment Lifecycle (RLHF)</h4>
|
| 926 |
+
<svg viewBox="0 0 400 120" style="width: 100%; height: auto;">
|
| 927 |
+
<!-- Lifecycle Nodes -->
|
| 928 |
+
<rect x="20" y="45" width="60" height="30" rx="2" fill="var(--surface)" stroke="var(--cyan)" />
|
| 929 |
+
<text x="35" y="65" fill="var(--text)" font-size="10">SFT</text>
|
| 930 |
+
<path d="M 80 60 L 120 60" stroke="var(--text-dim)" marker-end="url(#arrow-dim)" />
|
| 931 |
+
|
| 932 |
+
<rect x="120" y="45" width="100" height="30" rx="2" fill="var(--surface)" stroke="#e74c3c" />
|
| 933 |
+
<text x="130" y="65" fill="var(--text)" font-size="10">Reward Model</text>
|
| 934 |
+
<path d="M 220 60 L 260 60" stroke="var(--text-dim)" marker-end="url(#arrow-dim)" />
|
| 935 |
+
|
| 936 |
+
<rect x="260" y="45" width="100" height="30" rx="2" fill="rgba(46, 204, 113, 0.2)" stroke="var(--green)" />
|
| 937 |
+
<text x="275" y="65" fill="var(--text)" font-size="10">PPO / DPO</text>
|
| 938 |
+
|
| 939 |
+
<!-- Feedback loop -->
|
| 940 |
+
<path d="M 310 45 Q 200 0 50 45" fill="none" stroke="var(--green)" stroke-dasharray="2" marker-end="url(#arrow-green)" />
|
| 941 |
+
</svg>
|
| 942 |
+
<p style="font-size: 0.75em; color: var(--text-dim);">Alignment fine-tunes the base model to prioritize helpfulness and safety based on preference data.</p>
|
| 943 |
+
</div>
|
| 944 |
+
</div>
|
| 945 |
</div>
|
| 946 |
<div class="list-item">
|
| 947 |
<div class="list-num">04</div>
|
|
|
|
| 961 |
<h3>Logic, Math & Verification</h3>
|
| 962 |
<div class="list-item">
|
| 963 |
<div class="list-num">01</div>
|
| 964 |
+
<div>
|
| 965 |
+
<strong>Reasoning Trajectories:</strong> Spending "test-time compute" to solve complex problems.
|
| 966 |
+
|
| 967 |
+
<div class="visual-demo" style="margin-top: 15px; background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; border: 1px solid var(--cyan);">
|
| 968 |
+
<h4 style="color: var(--cyan); margin-bottom: 10px; font-size: 0.85em;">CoT vs. Reasoning Models (o1/R1)</h4>
|
| 969 |
+
<svg viewBox="0 0 400 120" style="width: 100%; height: auto;">
|
| 970 |
+
<!-- Prompt -->
|
| 971 |
+
<rect x="10" y="10" width="60" height="20" rx="2" fill="var(--surface)" stroke="var(--cyan)" />
|
| 972 |
+
<text x="18" y="25" fill="var(--text)" font-size="9">Prompt</text>
|
| 973 |
+
|
| 974 |
+
<!-- Hidden Trajectory -->
|
| 975 |
+
<path d="M 70 20 L 150 20 L 150 60 L 250 60" fill="none" stroke="var(--orange)" stroke-width="2" stroke-dasharray="3" />
|
| 976 |
+
<rect x="130" y="70" width="100" height="20" rx="2" fill="rgba(255, 170, 0, 0.1)" stroke="var(--orange)" />
|
| 977 |
+
<text x="140" y="85" fill="var(--orange)" font-size="8">Thinking Tokens...</text>
|
| 978 |
+
|
| 979 |
+
<!-- Output -->
|
| 980 |
+
<path d="M 230 80 L 320 80" stroke="var(--green)" marker-end="url(#arrow-green)" />
|
| 981 |
+
<rect x="320" y="70" width="60" height="25" rx="2" fill="rgba(46, 204, 113, 0.1)" stroke="var(--green)" />
|
| 982 |
+
<text x="335" y="85" fill="var(--text)" font-size="9">Answer</text>
|
| 983 |
+
</svg>
|
| 984 |
+
<p style="font-size: 0.75em; color: var(--text-dim);">Reasoning models explore multiple paths internally before committing to a final visible output.</p>
|
| 985 |
+
</div>
|
| 986 |
+
</div>
|
| 987 |
</div>
|
| 988 |
<div class="list-item">
|
| 989 |
<div class="list-num">02</div>
|
|
|
|
| 1043 |
<h3>The Multi-Senses Architecture</h3>
|
| 1044 |
<div class="list-item">
|
| 1045 |
<div class="list-num">01</div>
|
| 1046 |
+
<div>
|
| 1047 |
+
<strong>Vision Transformers (ViT):</strong> "An Image is worth 16x16 words."
|
| 1048 |
+
|
| 1049 |
+
<div class="visual-demo" style="margin-top: 15px; background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; border: 1px solid var(--cyan);">
|
| 1050 |
+
<h4 style="color: var(--cyan); margin-bottom: 10px; font-size: 0.85em;">Image-to-Patch Tokenization</h4>
|
| 1051 |
+
<svg viewBox="0 0 400 120" style="width: 100%; height: auto;">
|
| 1052 |
+
<!-- Image Grid -->
|
| 1053 |
+
<rect x="20" y="20" width="60" height="60" fill="var(--cyan)" opacity="0.1" stroke="var(--cyan)" />
|
| 1054 |
+
<line x1="40" y1="20" x2="40" y2="80" stroke="var(--cyan)" opacity="0.3" />
|
| 1055 |
+
<line x1="60" y1="20" x2="60" y2="80" stroke="var(--cyan)" opacity="0.3" />
|
| 1056 |
+
<line x1="20" y1="40" x2="80" y2="40" stroke="var(--cyan)" opacity="0.3" />
|
| 1057 |
+
<line x1="20" y1="60" x2="80" y2="60" stroke="var(--cyan)" opacity="0.3" />
|
| 1058 |
+
|
| 1059 |
+
<!-- Flattening -->
|
| 1060 |
+
<path d="M 85 50 L 150 50" stroke="var(--text-dim)" marker-end="url(#arrow-dim)" />
|
| 1061 |
+
|
| 1062 |
+
<!-- Linear Projection -->
|
| 1063 |
+
<rect x="160" y="40" width="20" height="20" fill="var(--cyan)" stroke="var(--cyan)" />
|
| 1064 |
+
<rect x="190" y="40" width="20" height="20" fill="var(--cyan)" stroke="var(--cyan)" />
|
| 1065 |
+
<rect x="220" y="40" width="20" height="20" fill="var(--cyan)" stroke="var(--cyan)" />
|
| 1066 |
+
<text x="250" y="55" fill="var(--text-dim)" font-size="12">...</text>
|
| 1067 |
+
|
| 1068 |
+
<!-- Input to Transformer -->
|
| 1069 |
+
<path d="M 170 35 L 250 15" stroke="var(--orange)" stroke-dasharray="2" />
|
| 1070 |
+
<text x="260" y="20" fill="var(--orange)" font-size="10">Attention Ready</text>
|
| 1071 |
+
</svg>
|
| 1072 |
+
<p style="font-size: 0.75em; color: var(--text-dim);">Images are split into fixed-size patches; each patch is flattened and linearly projected, so the same Transformer architecture can process visual data as a sequence of tokens.</p>
|
| 1073 |
+
</div>
|
| 1074 |
+
</div>
|
| 1075 |
</div>
|
| 1076 |
<div class="list-item">
|
| 1077 |
<div class="list-num">02</div>
|