AashishAIHub commited on
Commit
40645d0
·
1 Parent(s): 06d1b56

feat: AI Engineer Masterclass 10/10 curriculum update (Guidebook 2025)

Browse files
Files changed (2) hide show
  1. .gitignore +3 -0
  2. GenAI-AgenticAI/app.js +364 -27
.gitignore CHANGED
@@ -2,3 +2,6 @@
2
  .DS_Store
3
  >__MACOSX/
4
  *.pdf
 
 
 
 
2
  .DS_Store
3
  >__MACOSX/
4
  *.pdf
5
+ .venv/
6
+ node_modules/
7
+ AI Engineering Guidebook.pdf
GenAI-AgenticAI/app.js CHANGED
@@ -1,21 +1,21 @@
1
- // AI Engineer Masterclass — Module Data (based on AI Engineering Guidebook 2025)
2
  const modules = [
3
- { id: 'llm-fundamentals', icon: '🧠', title: 'LLM Fundamentals', desc: 'Tokenization, attention, pre-training, 7 generation parameters, text generation strategies', category: 'Foundation', catClass: 'cat-foundation' },
4
- { id: 'transformers', icon: '⚡', title: 'Transformer Architecture', desc: 'Self-attention math, multi-head attention, positional encoding, MoE vs dense', category: 'Foundation', catClass: 'cat-foundation' },
5
- { id: 'huggingface', icon: '🤗', title: 'Hugging Face Ecosystem', desc: 'Transformers library, Model Hub, Datasets, Spaces, PEFT', category: 'Core Tools', catClass: 'cat-core' },
6
- { id: 'finetuning', icon: '🎯', title: 'Fine-Tuning & PEFT', desc: 'LoRA, QLoRA, SFT vs RFT, GRPO reasoning LLMs, IFT dataset generation', category: 'Core', catClass: 'cat-core' },
7
- { id: 'rag', icon: '🔍', title: 'RAG Pipelines', desc: 'Chunking, embedding models, vector search, re-ranking, HyDE, REFRAG, CAG, Agentic RAG', category: 'Core', catClass: 'cat-core' },
8
- { id: 'vectordb', icon: '🗄️', title: 'Vector Databases', desc: 'FAISS, Pinecone, ChromaDB, HNSW, IVF algorithms', category: 'Core', catClass: 'cat-core' },
9
- { id: 'context-engineering', icon: '🧩', title: 'Context Engineering', desc: 'What to put in context, 6 context types for agents, manual RAG vs agentic context', category: 'Core', catClass: 'cat-core' },
10
- { id: 'agents', icon: '🤖', title: 'AI Agents & Frameworks', desc: 'ReAct, LangChain, LangGraph, CrewAI, AutoGen, 5 levels of agentic AI, memory types', category: 'Agentic', catClass: 'cat-agent' },
11
- { id: 'agentic-patterns', icon: '🔮', title: 'Agentic Design Patterns', desc: '5 design patterns, ReAct from scratch, 4 layers of agentic AI, 30 must-know terms', category: 'Agentic', catClass: 'cat-agent' },
12
- { id: 'multiagent', icon: '🕸️', title: 'Multi-Agent Systems', desc: '7 patterns, orchestration, supervisor, peer-to-peer, A2A & AG-UI protocols', category: 'Agentic', catClass: 'cat-agent' },
13
- { id: 'agent-protocols', icon: '📡', title: 'Agent Protocol Landscape', desc: 'MCP, A2A, AG-UI, Agent Protocol spec, comparison and when to use each', category: 'Agentic', catClass: 'cat-agent' },
14
- { id: 'tools', icon: '🔧', title: 'Function Calling & Tools', desc: 'OpenAI function calling, tool schemas, MCP protocol, JSON prompting', category: 'Agentic', catClass: 'cat-agent' },
15
- { id: 'evaluation', icon: '📊', title: 'Evaluation & Benchmarks', desc: 'LLM-as-a-judge, RAGAS, BLEU/ROUGE, human eval', category: 'Production', catClass: 'cat-production' },
16
- { id: 'guardrails', icon: '🛡️', title: 'Guardrails & Safety', desc: 'Hallucination detection, content filtering, red-teaming', category: 'Production', catClass: 'cat-production' },
17
- { id: 'deployment', icon: '🚀', title: 'Deployment & Serving', desc: 'vLLM, TGI, Ollama, quantization (GPTQ/AWQ/GGUF)', category: 'Production', catClass: 'cat-production' },
18
- { id: 'production', icon: '⚙️', title: 'Production Patterns', desc: 'Caching, streaming, rate limiting, cost optimization', category: 'Production', catClass: 'cat-production' }
19
  ];
20
 
21
 
@@ -71,13 +71,36 @@ const MODULE_CONTENT = {
71
  <h3>4. 4 LLM Text Generation Strategies</h3>
72
  <p>Decoding is the process of picking the next token. How we pick it determines the style of the output.</p>
73
  <ul>
74
- <li><strong>Greedy Strategy:</strong> Always pick the single token with the highest probability. <em>Issue:</em> Often leads to repetitive, low-quality loops.</li>
75
- <li><strong>Multinomial Sampling:</strong> Sample from the probability distribution (controlled by temperature). <em>Benefit:</em> Much more creative and human-like.</li>
76
- <li><strong>Beam Search:</strong> Explores multiple parallel paths ("beams") and picks the sequence with the highest total probability. <em>Best for:</em> Translation and code where sequence-level correctness matters more than creativity.</li>
77
- <li><strong>Nucleus (Top-p) Sampling:</strong> Restricts sampling to a dynamic "nucleus" of tokens that sum to probability p. <em>Best for:</em> General purpose chat.</li>
78
  </ul>
79
 
80
- <h3>4. Context Window — The LLM's Working Memory</h3>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  <p>The context window determines how many tokens the model can process in a single call (input + output combined).</p>
82
  <table>
83
  <tr><th>Model</th><th>Context Window</th><th>Approx. Pages</th></tr>
@@ -915,14 +938,24 @@ Object.assign(MODULE_CONTENT, {
915
 
916
  <h3>5. Traditional RAG vs HyDE</h3>
917
  <div class="comparison">
918
- <div class="comparison-bad"><strong>Naive:</strong> Embed "How is company X doing?". Vector search searches for fragments of that query.</div>
919
- <div class="comparison-good"><strong>HyDE:</strong> LLM writes a <em>hypothetical</em> investor report for company X. We embed THAT report. Vector search finds similar *actual* reports.</div>
920
  </div>
921
 
922
- <h3>6. RAG vs Agentic RAG and AI Memory</h3>
923
- <p>Standard RAG is a <strong>one-shot</strong> process. <strong>Agentic RAG</strong> allows an agent to decide how to search, what to search, and when to stop. Combined with <strong>AI Memory</strong> (persisting relevant facts across sessions), this creates systems that grow smarter with user interaction.</p>
 
 
 
 
 
 
 
924
 
925
- <h3>7. Evaluating RAG (RAGAS)</h3>
 
 
 
926
  <table>
927
  <tr><th>Metric</th><th>Measures</th><th>Target</th></tr>
928
  <tr><td><strong>Faithfulness</strong></td><td>Are claims supported by context?</td><td>0.9+</td></tr>
@@ -2299,6 +2332,310 @@ agent_card = {
2299
  </div>`
2300
  };
2301
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2302
  // ─── Dashboard Render ───────────────────────────────────────────────────────
2303
  function renderDashboard() {
2304
  const grid = document.getElementById('modulesGrid');
 
1
+ // AI Engineer Masterclass — Module Data (Updated for AI Engineering Guidebook 2025 Edition)
2
  const modules = [
3
+ { id: 'llm-fundamentals', icon: '🧠', title: 'LLM Fundamentals', desc: 'Decoding strategies (SLED, Contrastive), 3 distillation types, 4 local setups, 7 generation params', category: 'Foundation', catClass: 'cat-foundation' },
4
+ { id: 'transformers', icon: '⚡', title: 'Transformer Architecture', desc: 'KV Cache, GQA vs MQA, FlashAttention-3, RoPE, MoE routing patterns', category: 'Foundation', catClass: 'cat-foundation' },
5
+ { id: 'huggingface', icon: '🤗', title: 'Hugging Face Ecosystem', desc: 'Transformers, PEFT, TRL, Safetensors, Hub API, programmatic deployment', category: 'Core Tools', catClass: 'cat-core' },
6
+ { id: 'finetuning', icon: '🎯', title: 'Fine-Tuning & PEFT', desc: 'LoRA from scratch, QLoRA, SFT vs RFT, GRPO/DeepSeek-R1 logic, IFT generation', category: 'Core', catClass: 'cat-core' },
7
+ { id: 'rag', icon: '🔍', title: 'RAG Pipelines', desc: '8 architectures (HyDE, REFRAG, CAG), 5 chunking types, RAGAS, context recall', category: 'Core', catClass: 'cat-core' },
8
+ { id: 'vectordb', icon: '🗄️', title: 'Vector Databases', desc: 'HNSW levels, IVF-PQ, DiskANN, distance metrics, metadata filtering patterns', category: 'Core', catClass: 'cat-core' },
9
+ { id: 'context-engineering', icon: '🧩', title: 'Context Engineering', desc: '6 context types, context window management, Claude skills, dynamic assembly', category: 'Core', catClass: 'cat-core' },
10
+ { id: 'agents', icon: '🤖', title: 'AI Agents & Frameworks', desc: 'ReAct loops, 5 levels of autonomy, 4 layers of architecture, 8 multi-agent patterns', category: 'Agentic', catClass: 'cat-agent' },
11
+ { id: 'agentic-patterns', icon: '🔮', title: 'Agentic Design Patterns', desc: 'Reflection, Tool-use, Planning, Multi-agent, Memory, ReAct from scratch', category: 'Agentic', catClass: 'cat-agent' },
12
+ { id: 'multiagent', icon: '🕸️', title: 'Multi-Agent Systems', desc: '7 Patterns (Parallel, Router, Sequential, Hierarchical), swarm logic, orchestration', category: 'Agentic', catClass: 'cat-agent' },
13
+ { id: 'agent-protocols', icon: '📡', title: 'Agent Protocol Landscape', desc: 'Standardizing Agent-to-Tool (MCP), Agent-to-Agent (A2A), and Agent-to-User (AG-UI)', category: 'Agentic', catClass: 'cat-agent' },
14
+ { id: 'tools', icon: '🔧', title: 'Function Calling & Tools', desc: 'Tool schemas, JSON prompting, Verbalized Sampling, few-shot tool usage', category: 'Agentic', catClass: 'cat-agent' },
15
+ { id: 'evaluation', icon: '📊', title: 'Evaluation & Benchmarks', desc: 'LLM-as-a-judge, DeepEval, DeepSeek benchmarks, quality vs latency tradeoffs', category: 'Production', catClass: 'cat-production' },
16
+ { id: 'guardrails', icon: '🛡️', title: 'Guardrails & Safety', desc: 'Hallucination detection, LlamaGuard, NeMo, input/output filtering', category: 'Production', catClass: 'cat-production' },
17
+ { id: 'deployment', icon: '🚀', title: 'Deployment & Serving', desc: 'vLLM, TGI, PagedAttention, quantization (GGUF/EXL2), serving at scale', category: 'Production', catClass: 'cat-production' },
18
+ { id: 'production', icon: '⚙️', title: 'Production Patterns', desc: 'Observability, tracing, rate limiting, token cost optimization, caching', category: 'Production', catClass: 'cat-production' }
19
  ];
20
 
21
 
 
71
  <h3>4. 4 LLM Text Generation Strategies</h3>
72
  <p>Decoding is the process of picking the next token. How we pick it determines the style of the output.</p>
73
  <ul>
74
+ <li><strong>Greedy Strategy:</strong> Naively chooses the token with the highest probability. <em>Issue:</em> Often leads to repetitive, low-quality loops.</li>
75
+ <li><strong>Multinomial Sampling:</strong> Sample from the probability distribution available (controlled by temperature). <em>Benefit:</em> Much more creative and human-like.</li>
76
+ <li><strong>Beam Search:</strong> Explores top-K partial sequences (beams) and keeps alternatives alive. <em>Best for:</em> Translation and code where sequence-level correctness matters most.</li>
77
+ <li><strong>Contrastive Search:</strong> Balances fluency with diversity by penalizing continuations too similar to previous tokens. <em>Best for:</em> Long stories or creative writing to avoid loops.</li>
78
  </ul>
79
 
80
+ <div class="callout insight">
81
+ <div class="callout-title">🚀 Bonus: SLED (Self-Logits Evolution Decoding)</div>
82
+ <p>Normally, models use only the final layer's logits. <strong>SLED</strong> looks at how logits evolve across ALL layers. It measures consensus across internal representations, producing more <strong>grounded and factual</strong> outputs without retraining or extra data.</p>
83
+ </div>
84
+
85
+ <h3>5. 3 Techniques to Train an LLM using another LLM</h3>
86
+ <p>LLMs don't just learn from raw text; they transfer knowledge between models (Distillation). Distillation happens at Pre-training (Llama 4 Scout) or Post-training (DeepSeek-R1 to Qwen).</p>
87
+ <table>
88
+ <tr><th>Technique</th><th>How It Works</th><th>Advantage / Note</th></tr>
89
+ <tr><td><strong>1. Soft-label Distillation</strong></td><td>Student matches the Teacher's entire probability distribution (softmax).</td><td>Maximum reasoning transfer; requires access to logits.</td></tr>
90
+ <tr><td><strong>2. Hard-label Distillation</strong></td><td>Student matches only the final token choice (hard label) of the Teacher.</td><td>DeepSeek used this to distill R1 into smaller models.</td></tr>
91
+ <tr><td><strong>3. Co-distillation</strong></td><td>Train Teacher and Student together from scratch; both predict concurrently.</td><td>Gemma 3 used this during both pre- and post-training.</td></tr>
92
+ </table>
93
+
94
+ <h3>6. 4 Ways to Run LLMs Locally</h3>
95
+ <table>
96
+ <tr><th>Tool</th><th>Best For</th><th>Setup Complexity</th></tr>
97
+ <tr><td><strong>Ollama</strong></td><td>Simple CLI/Desktop usage; one-command 'run'.</td><td>Zero (Just install app)</td></tr>
98
+ <tr><td><strong>LM Studio</strong></td><td>GUI for chatting with GGUF models; eject/load easily.</td><td>Zero (Desktop app)</td></tr>
99
+ <tr><td><strong>vLLM</strong></td><td>High-performance serving and API hosting.</td><td>Low-Medium (pip install)</td></tr>
100
+ <tr><td><strong>llama.cpp</strong></td><td>Extreme portability (C++), runs on Mac/CPU/Android.</td><td>Medium (compile or download bins)</td></tr>
101
+ </table>
102
+
103
+ <h3>7. Context Window — The LLM's Working Memory</h3>
104
  <p>The context window determines how many tokens the model can process in a single call (input + output combined).</p>
105
  <table>
106
  <tr><th>Model</th><th>Context Window</th><th>Approx. Pages</th></tr>
 
938
 
939
  <h3>5. Traditional RAG vs HyDE</h3>
940
  <div class="comparison">
941
+ <div class="box-bad"><strong>Naive:</strong> Embed "How is company X doing?". Vector search searches for fragments of that query.</div>
942
+ <div class="box-good"><strong>HyDE:</strong> LLM writes a <em>hypothetical</em> investor report for company X. We embed THAT report. Vector search finds similar <em>actual</em> reports.</div>
943
  </div>
944
 
945
+ <h3>6. 5 Chunking Strategies for RAG</h3>
946
+ <table>
947
+ <tr><th>Strategy</th><th>Description</th><th>Best For</th></tr>
948
+ <tr><td><strong>1. Fixed-size</strong></td><td>Chunking by character/token count</td><td>Simple text, general usage</td></tr>
949
+ <tr><td><strong>2. Recursive</strong></td><td>Split by paragraph, then sentence</td><td>Most documents (LangChain default)</td></tr>
950
+ <tr><td><strong>3. Semantic</strong></td><td>Split where embedding distance jumps</td><td>Long books or unified narratives</td></tr>
951
+ <tr><td><strong>4. Structural</strong></td><td>Split by HTML headings or PDF sections</td><td>Technical docs, manuals</td></tr>
952
+ <tr><td><strong>5. Agentic</strong></td><td>LLM analyzes text and groups it</td><td>Highest accuracy, highest cost</td></tr>
953
+ </table>
954
 
955
+ <h3>7. RAG vs Agentic RAG and AI Memory</h3>
956
+ <p>Standard RAG is a <strong>one-shot</strong> process. <strong>Agentic RAG</strong> allows an agent to decide how to search, what to search, and when to stop. Combined with <strong>AI Memory</strong> (persisting relevant facts across sessions), this creates systems that grow smarter over time.</p>
957
+
958
+ <h3>8. Evaluating RAG (RAGAS)</h3>
959
  <table>
960
  <tr><th>Metric</th><th>Measures</th><th>Target</th></tr>
961
  <tr><td><strong>Faithfulness</strong></td><td>Are claims supported by context?</td><td>0.9+</td></tr>
 
2332
  </div>`
2333
  };
2334
 
2335
+ Object.assign(MODULE_CONTENT, {
2336
+ 'prompt-engineering': {
2337
+ concepts: `
2338
+ <div class="section">
2339
+ <h2>✍️ Prompting for Agents — Complete Deep Dive</h2>
2340
+ <div class="info-box">
2341
+ <div class="box-title">⚡ Why Specialized Prompting?</div>
2342
+ <div class="box-content">Agentic AI requires different prompting strategies than simple chat. Instead of asking for a final answer, we prompt the model to <strong>reason, format data, and make decisions</strong>. The output must be perfectly parseable by our code.</div>
2343
+ </div>
2344
+
2345
+ <h3>1. Three Prompting Techniques for Reasoning</h3>
2346
+ <p>To improve an LLM's logical accuracy and ability to plan, use these three structural techniques:</p>
2347
+ <table>
2348
+ <tr><th>Technique</th><th>How It Works</th><th>Best For</th></tr>
2349
+ <tr><td><strong>Chain of Thought (CoT)</strong></td><td>Adding "Think step by step" to the prompt, forcing the model to generate intermediate reasoning tokens before the final answer.</td><td>Math, logic, code, and agent planning.</td></tr>
2350
+ <tr><td><strong>Tree of Thoughts (ToT)</strong></td><td>Prompting the model to generate multiple possible paths, evaluate them, and perform search (BFS/DFS) across paths to find the optimal solution.</td><td>Complex multi-step reasoning where backtracking is needed.</td></tr>
2351
+ <tr><td><strong>Self-Consistency</strong></td><td>Prompting the model to generate N different CoT sequences, then taking the majority vote for the final answer.</td><td>Highly reliable fact extraction and math problems.</td></tr>
2352
+ </table>
2353
+
2354
+ <h3>2. Verbalized Sampling (Agent "Thinking Out Loud")</h3>
2355
+ <p>When an agent selects a tool to call, it can easily make a mistake if it just outputs the tool name directly. <strong>Verbalized Sampling</strong> requires the agent to explicitly output its reasoning process <em>before</em> outputting the tool call or action.</p>
2356
+ <div class="callout tip">
2357
+ <div class="callout-title">💡 Why Verbalized Sampling Works</div>
2358
+ <p>Because LLMs generate tokens autoregressively (left-to-right), the tokens representing the "thought" become part of the context for generating the "tool call" tokens. The tool call becomes more accurate because it is directly conditioned on an explicit rationale.</p>
2359
+ </div>
2360
+
2361
+ <h3>3. JSON Prompting for LLMs</h3>
2362
+ <p>Agents and orchestrators need structured data (JSON), not conversational text. If the JSON is malformed, the application crashes.</p>
2363
+ <ul>
2364
+ <li><strong>Define a Schema:</strong> Always provide a strict schema in the system prompt. "Output JSON matching this typescript interface: { ... }"</li>
2365
+ <li><strong>Use JSON Mode:</strong> Use <code>response_format: { type: "json_object" }</code> (OpenAI). The model is guaranteed to output valid JSON.</li>
2366
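+ <li><strong>Stop Sequences:</strong> For older or open models, using <code>}</code> as a stop sequence halts generation at the first closing brace, which prevents trailing text. This is only safe for flat objects (nested JSON would be cut off at the first inner brace), and many APIs omit the stop string from the output, so you may need to re-append it.</li>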
2367
+ <li><strong>Pre-filling the Assistant:</strong> Append <code>{</code> to the end of your prompt so the model is forced to start generating JSON keys instantly without saying "Here is the JSON...".</li>
2368
+ </ul>
2369
+ </div>\`,
2370
+ code: \`
2371
+ <div class="section">
2372
+ <h2>💻 Prompting for Agents — Code Examples</h2>
2373
+
2374
+ <h3>1. Verbalized Sampling Prompt Template</h3>
2375
+ <div class="code-block"><span class="keyword">const</span> system_prompt = <span class="string">\`You are a sophisticated AI agent with access to tools.
2376
+ When given a task, you MUST use the following format:
2377
+
2378
+ Thought: Consider what you need to do, step by step. Which tool is needed?
2379
+ Action: The name of the tool to use (e.g. "search_web", "calculate")
2380
+ Action Input: The arguments for the tool in valid JSON.
2381
+
2382
+ You MUST articulate your Thought before your Action.\`</span></div>
2383
+
2384
+ <h3>2. Forcing JSON on Open Models</h3>
2385
+ <div class="code-block"><span class="keyword">import</span> { pipeline } <span class="keyword">from</span> <span class="string">"@huggingface/transformers"</span>;
2386
+
2387
+ <span class="comment">// Provide schema and force it to start with {</span>
2388
+ <span class="keyword">const</span> prompt = <span class="string">"Extract name and age. Return JSON: {\"name\": string, \"age\": number}\n\nText: John is 25.\nOutput:\n{"</span>;
2389
+
2390
+ <span class="keyword">const</span> generator = <span class="keyword">await</span> pipeline(<span class="string">"text-generation"</span>, <span class="string">"meta-llama/Llama-3.2-1B-Instruct"</span>);
2391
+ <span class="keyword">const</span> out = <span class="keyword">await</span> generator(prompt, {
2392
+ max_new_tokens: <span class="number">50</span>,
2393
+ stop_strings: [<span class="string">"}"</span>] <span class="comment">// Stop generation exactly when JSON closes</span>
2394
+ });
2395
+ <span class="keyword">const</span> raw = <span class="string">"{"</span> + out[0].generated_text; <span class="comment">// Prepend the '{' that we forced</span>
2396
+ <span class="keyword">const</span> json = JSON.parse(raw);</div>
2397
+ </div>\`,
2398
+ interview: \`
2399
+ <div class="section">
2400
+ <h2>🎯 Prompt Engineering — Interview Questions</h2>
2401
+ <div class="interview-box"><strong>Q1: Why does Chain of Thought work?</strong><p><strong>Answer:</strong> It provides additional computational steps (tokens) for the model to process logic. Since an LLM spends a fixed amount of computation per token, forcing it to generate a 50-token thought process before answering allocates 50x more computation to solving the problem than just answering immediately.</p></div>
2402
+ <div class="interview-box"><strong>Q2: How is JSON Prompting different from OpenAI Function Calling?</strong><p><strong>Answer:</strong> JSON prompting is done via the text prompt and relies on the model's instruction following (good for open models). Function/Tool calling is a native API feature where the provider fine-tunes the model explicitly to output arguments matching a schema via constrained decoding, ensuring much higher reliability.</p></div>
2403
+ </div>\`
2404
+ },
2405
+ 'llm-optimization': {
2406
+ concepts: \`
2407
+ <div class="section">
2408
+ <h2>🗜️ LLM Optimization — Complete Deep Dive</h2>
2409
+ <div class="info-box">
2410
+ <div class="box-title">⚡ Why Optimization Matters</div>
2411
+ <div class="box-content">LLMs are massively compute- and memory-bound. A 70B parameter model in FP16 takes 140GB of VRAM just to load. Optimizing how models are compressed and how inference runs determines whether an application is economically viable.</div>
2412
+ </div>
2413
+
2414
+ <h3>1. Model Compression</h3>
2415
+ <p>Compression reduces the memory footprint and the number of bytes that must be read from memory per token, easing the memory-bandwidth bottleneck and leading to faster token generation.</p>
2416
+ <table>
2417
+ <tr><th>Technique</th><th>How It Works</th><th>Tradeoff</th></tr>
2418
+ <tr><td><strong>Quantization (PTQ)</strong></td><td>Convert parameters from FP16 (16-bit) to INT8 or INT4. Ex: GGUF, AWQ, GPTQ.</td><td>Slight accuracy loss, massive speed/memory gains.</td></tr>
2419
+ <tr><td><strong>Pruning</strong></td><td>Set near-zero weights to exactly zero, creating sparse matrices.</td><td>Requires specialized hardware for sparse acceleration.</td></tr>
2420
+ <tr><td><strong>Knowledge Distillation</strong></td><td>Train a smaller model (student) to match the probability distribution of a large model (teacher).</td><td>High upfront compute cost to train the student.</td></tr>
2421
+ </table>
2422
+
2423
+ <h3>2. Regular ML Inference vs LLM Inference</h3>
2424
+ <div class="comparison">
2425
+ <div class="comparison-bad"><strong>Regular ML (e.g., ResNet):</strong> One input → One forward pass → One output. Compute-bound (matrix multiplication speed is the bottleneck). Highly batchable.</div>
2426
+ <div class="comparison-good"><strong>LLM Inference:</strong> Autoregressive. One input → forward pass → 1 token → append token → forward pass → 1 token. Memory-bandwidth bound (reading huge weights from HBM to SRAM for every single token).</div>
2427
+ </div>
2428
+
2429
+ <h3>3. KV Caching in LLMs</h3>
2430
+ <p>During generation, each new token needs to pay attention to all past tokens. Recomputing the Key (K) and Value (V) matrices for all past tokens for every new token generation is <strong>O(N²)</strong> and terribly slow.</p>
2431
+ <p><strong>KV Caching</strong> stores the K and V tensors for all past tokens in GPU memory. For the next step, only the Q, K, V for the <em>newest token</em> are computed, and attention is run against the cached past. This reduces computation to <strong>O(N)</strong> per step.</p>
2432
+ <div class="callout warning">
2433
+ <div class="callout-title">⚠️ The KV Cache Bottleneck</div>
2434
+ <p>While KV caching solves the compute problem, it introduces a memory problem. A 100K context window across high batch sizes can cause the KV cache to consume more GPU RAM than the model weights themselves! This is why techniques like <strong>PagedAttention</strong> (vLLM) and <strong>GQA (Grouped Query Attention)</strong> were invented.</p>
2435
+ </div>
2436
+ </div>\`,
2437
+ code: \`
2438
+ <div class="section">
2439
+ <h2>💻 LLM Optimization — Code Examples</h2>
2440
+
2441
+ <h3>1. GGUF Quantization via Llama.cpp</h3>
2442
+ <div class="code-block"><span class="comment"># Convert HF model to 4-bit GGUF using llama.cpp</span>
2443
+ python llama.cpp/convert_hf_to_gguf.py models/Llama-3-8B \
2444
+ --outfile models/llama3-8b.gguf \
2445
+ --outtype q4_k_m
2446
+
2447
+ <span class="comment"># Run highly optimized inference locally in C++</span>
2448
+ ./llama.cpp/main -m models/llama3-8b.gguf -n 256 -p "Explain KV caching"</div>
2449
+
2450
+ <h3>2. AWQ Quantization in Python</h3>
2451
+ <div class="code-block"><span class="keyword">from</span> awq <span class="keyword">import</span> AutoAWQForCausalLM
2452
+ <span class="keyword">from</span> transformers <span class="keyword">import</span> AutoTokenizer
2453
+
2454
+ model_path = <span class="string">"meta-llama/Llama-3-8B"</span>
2455
+ quant_path = <span class="string">"llama-3-8b-awq"</span>
2456
+ quant_config = { <span class="string">"zero_point"</span>: True, <span class="string">"q_group_size"</span>: 128, <span class="string">"w_bit"</span>: 4 }
2457
+
2458
+ <span class="comment"># Quantize and save</span>
2459
+ model = AutoAWQForCausalLM.from_pretrained(model_path)
2460
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
2461
+ model.quantize(tokenizer, quant_config=quant_config)
2462
+ model.save_quantized(quant_path)
2463
+ tokenizer.save_pretrained(quant_path)</div>
2464
+ </div>\`,
2465
+ interview: \`
2466
+ <div class="section">
2467
+ <h2>🎯 LLM Optimization — Interview Questions</h2>
2468
+ <div class="interview-box"><strong>Q1: What is the difference between compute-bound and memory-bandwidth bound?</strong><p><strong>Answer:</strong> Compute-bound means the GPU spends all its time doing math (matrix multiplications). Memory-bandwidth bound means the math is easy, but the GPU spends all its time waiting for weights to be copied from High Bandwidth Memory (HBM) to on-chip SRAM. LLM prefill (reading the prompt) is compute-bound, but decoding (generating tokens one by one) is memory-bandwidth bound.</p></div>
2469
+ <div class="interview-box"><strong>Q2: Assume you use vLLM. What is PagedAttention?</strong><p><strong>Answer:</strong> Normally, KV cache is pre-allocated contiguously in GPU memory. Because output lengths are unknown, frameworks over-allocate memory, wasting up to 60%. PagedAttention divides the KV cache into small blocks (pages) and allocates them dynamically, like virtual memory in an OS. This allows near-zero waste and 2-4x higher concurrency (batching).</p></div>
2470
+ </div>\`
2471
+ },
2472
+ 'llm-observability': {
2473
+ concepts: \`
2474
+ <div class="section">
2475
+ <h2>🔭 LLM Observability — Complete Deep Dive</h2>
2476
+ <div class="info-box">
2477
+ <div class="box-title">⚡ Evaluation vs. Observability</div>
2478
+ <div class="box-content"><strong>Evaluation</strong> happens offline or asynchronously to check if a model meets a standard (metrics, RAGAS scores). <strong>Observability</strong> is runtime monitoring of production systems to understand exactly what the model did, how long it took, what tools it called, and how much it cost.</div>
2479
+ </div>
2480
+
2481
+ <h3>1. Key Observability Metrics</h3>
2482
+ <ul>
2483
+ <li><strong>Latency:</strong> Time-to-First-Token (TTFT), Tokens-Per-Second (TPS), Total request duration.</li>
2484
+ <li><strong>Cost & Usage:</strong> Token tracking (Prompt tokens vs Completion tokens), dollar cost calculation per provider.</li>
2485
+ <li><strong>Trace Visualization:</strong> For agents and chains, tracking the exact tree of execution. (User Prompt -> Retriever -> Tool Call 1 -> Tool Call 2 -> Final LLM Answer).</li>
2486
+ <li><strong>Quality/Feedback:</strong> Capturing user thumbs up/down, implicit feedback (copy-pasting the result).</li>
2487
+ </ul>
2488
+
2489
+ <h3>2. Tracing Implementations</h3>
2490
+ <p>Standard software uses APM (Datadog, New Relic) for tracing. LLMs require specialized APMs (Langfuse, LangSmith, Helicone, Opik) because the payload sizes are huge and the dependencies (prompts, tool results) are non-standard.</p>
2491
+ <table>
2492
+ <tr><th>Platform</th><th>Best For</th></tr>
2493
+ <tr><td><strong>LangSmith</strong></td><td>Deep LangChain integration and easy local debugging.</td></tr>
2494
+ <tr><td><strong>Langfuse</strong></td><td>Open-source, framework-agnostic tracing with a great UI.</td></tr>
2495
+ <tr><td><strong>Helicone</strong></td><td>Proxy-based observability (just change the base URL, no SDK needed).</td></tr>
2496
+ <tr><td><strong>Opik (by Comet)</strong></td><td>Agent optimization and evaluation natively integrated with traces.</td></tr>
2497
+ </table>
2498
+ </div>\`,
2499
+ code: \`
2500
+ <div class="section">
2501
+ <h2>💻 Observability — Code Examples</h2>
2502
+
2503
+ <h3>1. Langfuse Tracing with OpenAI</h3>
2504
+ <div class="code-block"><span class="keyword">from</span> langfuse.openai <span class="keyword">import</span> OpenAI
2505
+ <span class="comment"># Drop-in replacement! Automatically traces all API calls.</span>
2506
+
2507
+ client = OpenAI()
2508
+
2509
+ response = client.chat.completions.create(
2510
+ model=<span class="string">"gpt-4o"</span>,
2511
+ messages=[{<span class="string">"role"</span>: <span class="string">"user"</span>, <span class="string">"content"</span>: <span class="string">"Write a Python script."</span>}],
2512
+ name=<span class="string">"script-generation"</span>, <span class="comment"># Optional trace grouping</span>
2513
+ metadata={<span class="string">"user_id"</span>: <span class="string">"123"</span>, <span class="string">"env"</span>: <span class="string">"production"</span>}
2514
+ )</div>
2515
+
2516
+ <h3>2. LangSmith Tracing via Decorators</h3>
2517
+ <div class="code-block"><span class="keyword">from</span> langsmith <span class="keyword">import</span> traceable
2518
+ <span class="keyword">from</span> openai <span class="keyword">import</span> Client
2519
+
2520
+ client = Client()
2521
+
2522
+ <span class="preprocessor">@traceable</span>(name=<span class="string">"RAG Pipeline"</span>)
2523
+ <span class="keyword">def</span> <span class="function">my_rag_agent</span>(query):
2524
+ <span class="comment"># All LLM calls inside this function get nested inside the "RAG Pipeline" trace segment</span>
2525
+ context = retrieve_docs(query) <span class="comment"># Can add @traceable to this too</span>
2526
+
2527
+ resp = client.chat.completions.create(
2528
+ model=<span class="string">"gpt-4o"</span>,
2529
+ messages=[{<span class="string">"role"</span>: <span class="string">"user"</span>, <span class="string">"content"</span>: <span class="string">f"Context: {context}\nQuery: {query}"</span>}]
2530
+ )
2531
+ <span class="keyword">return</span> resp.choices[<span class="number">0</span>].message.content</div>
2532
+ </div>\`,
2533
+ interview: \`
2534
+ <div class="section">
2535
+ <h2>🎯 Observability — Interview Questions</h2>
2536
+ <div class="interview-box"><strong>Q1: What is Time-To-First-Token (TTFT) and why does it matter?</strong><p><strong>Answer:</strong> TTFT measures the latency from the moment the user sends the request until the first token streams back to the client. In LLM applications, total end-to-end latency might be 5-10 seconds, which is unacceptable for UX. Streaming combined with low TTFT (&lt;1 second) creates the illusion of speed and keeps users engaged.</p></div>
2537
+ <div class="interview-box"><strong>Q2: Why use a proxy like Helicone over an SDK like LangSmith?</strong><p><strong>Answer:</strong> A proxy requires almost no code changes — you only change the API base URL from <code>api.openai.com</code> to <code>oai.helicone.ai</code> and pass your Helicone API key in the <code>Helicone-Auth</code> header. It automatically logs all prompts, responses, costs, and latencies. However, an SDK (like Langfuse/LangSmith) is required if you want deep, nested trace trees for complex agents (e.g., seeing exactly which step in a 10-step LangGraph flow failed).</p></div>
2538
+ </div>\`
2539
+ },
2540
+ 'multiagent': {
2541
+ concepts: \`
2542
+ <div class="section">
2543
+ <h2>🕸️ Multi-Agent Systems (MAS)</h2>
2544
+ <div class="info-box">
2545
+ <div class="box-title">Why Multiple Agents?</div>
2546
+ <div class="box-content">Monolithic agents (one LLM with one giant prompt) are brittle. Specialized agents that collaborate reduce friction, improve accuracy, and enable parallel processing. MAS is about <strong>orchestration</strong> — how agents talk to each other to solve big tasks.</div>
2547
+ </div>
2548
+
2549
+ <h3>7 Core Patterns of Multi-Agent Orchestration</h3>
2550
+ <table>
2551
+ <tr><th>Pattern</th><th>How It Works</th><th>Best For</th></tr>
2552
+ <tr><td><strong>1. Parallel</strong></td><td>Tasks run concurrently across specialists (e.g., searcher + coder).</td><td>Reducing latency in complex pipelines.</td></tr>
2553
+ <tr><td><strong>2. Sequential</strong></td><td>Step-by-step handoff (e.g., Coder → Reviewer → Deployer).</td><td>ETL chains, code development, linear workflows.</td></tr>
2554
+ <tr><td><strong>3. Loop</strong></td><td>Continuous refinement until quality threshold is met.</td><td>Report generation, creative writing, proofreading.</td></tr>
2555
+ <tr><td><strong>4. Router</strong></td><td>Controller agent directs task to the right specialist.</td><td>Customer support (billing vs technical vs sales).</td></tr>
2556
+ <tr><td><strong>5. Aggregator</strong></td><td>Many agents form opinions; one central agent merges them.</td><td>Consensus voting, RAG retrieval fusion.</td></tr>
2557
+ <tr><td><strong>6. Network</strong></td><td>No hierarchy; agents communicate freely (peer-to-peer).</td><td>Simulations, games, collective brainstorming.</td></tr>
2558
+ <tr><td><strong>7. Hierarchical</strong></td><td>Planner/Manager agent delegates to workers and tracks progress.</td><td>Large enterprise projects with many sub-tasks.</td></tr>
2559
+ </table>
2560
+
2561
+ <h3>A2A: The Protocol for Teamwork</h3>
2562
+ <p><strong>Agent-to-Agent (A2A)</strong> protocol standardizes how these agents exchange context and instructions. Instead of sharing a global state, they exchange <strong>Agent Cards</strong> and <strong>Task Payloads</strong>. This allows an agent built in CrewAI to delegate a task to an agent built in LangGraph.</p>
2563
+
2564
+ <div class="callout tip">
2565
+ <div class="callout-title">💡 Minimizing Friction</div>
2566
+ <p>When picking a pattern, prioritize minimizing communication overhead. Ten agents aren't better than two if they duplicate work. The system should feel smarter than its individual parts.</p>
2567
+ </div>
2568
+ </div>\`,
2569
+ code: \`
2570
+ <div class="section">
2571
+ <h2>💻 Multi-Agent — Code Examples</h2>
2572
+ <h3>Simple Router Pattern with LiteLLM</h3>
2573
+ <div class="code-block"><span class="keyword">from</span> litellm <span class="keyword">import</span> completion
2574
+
2575
+ <span class="keyword">def</span> <span class="function">router_agent</span>(query):
2576
+ <span class="comment"># Intent classification</span>
2577
+ intent = completion(
2578
+ model=<span class="string">"gpt-4o-mini"</span>,
2579
+ messages=[{<span class="string">"role"</span>: <span class="string">"user"</span>, <span class="string">"content"</span>: <span class="string">f"Classify: {query}. Labels: [CODING, FINANCE, GENERAL]"</span>}]
2580
+ ).choices[0].message.content
2581
+
2582
+ <span class="keyword">if</span> <span class="string">"CODING"</span> <span class="keyword">in</span> intent:
2583
+ <span class="keyword">return</span> coding_specialist(query)
2584
+ <span class="keyword">elif</span> <span class="string">"FINANCE"</span> <span class="keyword">in</span> intent:
2585
+ <span class="keyword">return</span> finance_specialist(query)
2586
+ <span class="keyword">else</span>:
2587
+ <span class="keyword">return</span> general_agent(query)</div>
2588
+ </div>\`,
2589
+ interview: \`
2590
+ <div class="section">
2591
+ <h2>🎯 Multi-Agent — Interview Questions</h2>
2592
+ <div class="interview-box"><strong>Q1: Parallel vs Sequential orchestration?</strong><p><strong>Answer:</strong> Parallel is for independent tasks (data extraction + web search) to reduce latency. Sequential is for dependent tasks where step B needs output of step A (code writing then code review). Use parallel for scale, sequential for quality-controlled pipelines.</p></div>
2593
+ <div class="interview-box"><strong>Q2: What is the Hierarchical pattern?</strong><p><strong>Answer:</strong> It mimics a corporate structure: a Manager/Planner agent receives the high-level goal, breaks it into sub-tasks, and delegates them to specialized Worker agents. The Manager tracks state and makes the final quality check. Best for complex, ambiguous projects.</p></div>
2594
+ </div>\`
2595
+ },
2596
+ 'tools': {
2597
+ concepts: \`
2598
+ <div class="section">
2599
+ <h2>🔧 Function Calling & Tools</h2>
2600
+ <div class="info-box">
2601
+ <div class="box-title">Tools: The Hands of the LLM</div>
2602
+ <div class="box-content">An LLM without tools is just a talker. Tools give LLMs <strong>agency</strong>. Function calling allows models to generate structured arguments for functions you've defined, enabling real-world actions like database queries, web searches, or code execution.</div>
2603
+ </div>
2604
+
2605
+ <h3>1. The Function Calling Lifecycle</h3>
2606
+ <p>1. <strong>Definition:</strong> Send tool schemas (JSON) to LLM. 2. <strong>Selection:</strong> LLM realizes it needs a tool and outputs <code>tool_calls</code>. 3. <strong>Execution:</strong> Your code runs the tool locally. 4. <strong>Feedback:</strong> Result is sent back to LLM. 5. <strong>Final Output:</strong> LLM uses result to answer user.</p>
2607
+
2608
+ <h3>2. JSON Prompting vs Native Tooling</h3>
2609
+ <p><strong>JSON Prompting:</strong> Manually instructing the model to output JSON (used for open models). <strong>Native Tooling:</strong> Using provider APIs (OpenAI/Claude) which use <strong>constrained decoding</strong> for 99.9% reliability.</p>
2610
+
2611
+ <h3>3. Verbalized Sampling</h3>
2612
+ <p>Forcing the agent to generate a "Thought:" block before the "Action:" block. This conditions the tool selection on a logical premise, significantly reducing errors in choosing the wrong tool or arguments.</p>
2613
+ </div>\`,
2614
+ code: \`
2615
+ <div class="section">
2616
+ <h2>💻 Tools — Code Examples</h2>
2617
+ <h3>OpenAI Native Tool Call</h3>
2618
+ <div class="code-block">tools = [{
2619
+ <span class="string">"type"</span>: <span class="string">"function"</span>,
2620
+ <span class="string">"function"</span>: {
2621
+ <span class="string">"name"</span>: <span class="string">"get_stock_price"</span>,
2622
+ <span class="string">"parameters"</span>: {
2623
+ <span class="string">"type"</span>: <span class="string">"object"</span>,
2624
+ <span class="string">"properties"</span>: {<span class="string">"symbol"</span>: {<span class="string">"type"</span>: <span class="string">"string"</span>}}
2625
+ }
2626
+ }
2627
+ }]
2628
+ <span class="comment"># Pass this to chat.completions.create(..., tools=tools)</span></div>
2629
+ </div>\`,
2630
+ interview: \`
2631
+ <div class="section">
2632
+ <h2>🎯 Tools — Interview Questions</h2>
2633
+ <div class="interview-box"><strong>Q1: Why use Verbalized Sampling in tool calling?</strong><p><strong>Answer:</strong> It forces the model to articulate a rationale <em>before</em> picking a tool. Since tokens are generated left-to-right, the tool selection becomes conditioned on the reasoning, which increases precision, especially when multiple similar tools exist.</p></div>
2634
+ </div>\`
2635
+ }
2636
+ });
2637
+
2638
+
2639
  // ─── Dashboard Render ───────────────────────────────────────────────────────
2640
  function renderDashboard() {
2641
  const grid = document.getElementById('modulesGrid');