fredmo committed on
Commit
13e6b23
·
verified ·
1 Parent(s): 3c77c49

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +850 -19
index.html CHANGED
@@ -1,19 +1,850 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
19
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>The MLOps Engineer's Cheatsheet for Model Serving</title>
7
+ <link rel="preconnect" href="https://fonts.googleapis.com">
8
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
9
+ <link href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" rel="stylesheet">
10
+ <link href="https://fonts.googleapis.com/icon?family=Material+Icons" rel="stylesheet">
11
+ <style>
12
+ /* --- General Setup & Variables --- */
13
+ :root {
14
+ --primary-color: #1E88E5; /* Blue */
15
+ --primary-dark: #1565C0;
16
+ --secondary-color: #004d40; /* Dark Teal for contrast */
17
+ --background-color: #f4f6f8;
18
+ --card-bg-color: #ffffff;
19
+ --text-color: #333;
20
+ --heading-color: #212121;
21
+ --subtle-text-color: #555;
22
+ --border-color: #e0e0e0;
23
+ --code-bg-color: #282c34;
24
+ --code-text-color: #abb2bf;
25
+ --shadow: 0 4px 12px rgba(0,0,0,0.1);
26
+ --tile-hover-shadow: 0 6px 16px rgba(0,0,0,0.15);
27
+ }
28
+
29
+ body {
30
+ font-family: 'Roboto', sans-serif;
31
+ background-color: var(--background-color);
32
+ color: var(--text-color);
33
+ margin: 0;
34
+ padding: 0;
35
+ line-height: 1.6;
36
+ }
37
+
38
+ /* --- Layout & Containers --- */
39
+ .container {
40
+ max-width: 1200px;
41
+ margin: 0 auto;
42
+ padding: 2rem;
43
+ }
44
+
45
+ header {
46
+ text-align: center;
47
+ margin-bottom: 2rem;
48
+ }
49
+
50
+ header h1 {
51
+ color: var(--heading-color);
52
+ font-weight: 700;
53
+ font-size: 2.8rem;
54
+ margin-bottom: 0.5rem;
55
+ }
56
+
57
+ header p {
58
+ font-size: 1.1rem;
59
+ color: var(--subtle-text-color);
60
+ max-width: 800px;
61
+ margin: 0 auto;
62
+ }
63
+
64
+ .main-section-title {
65
+ font-size: 2.2rem;
66
+ color: var(--heading-color);
67
+ border-bottom: 3px solid var(--primary-color);
68
+ padding-bottom: 0.75rem;
69
+ margin-top: 3rem;
70
+ margin-bottom: 2rem;
71
+ display: flex;
72
+ align-items: center;
73
+ }
74
+
75
+ .main-section-title .material-icons {
76
+ font-size: 2.8rem;
77
+ margin-right: 1rem;
78
+ }
79
+
80
+ /* --- Tile Navigation --- */
81
+ .tile-container {
82
+ display: grid;
83
+ grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
84
+ gap: 1.5rem;
85
+ margin-bottom: 2.5rem;
86
+ }
87
+
88
+ .tile {
89
+ background-color: var(--card-bg-color);
90
+ border: 2px solid var(--border-color);
91
+ border-radius: 8px;
92
+ padding: 1.5rem;
93
+ text-align: center;
94
+ cursor: pointer;
95
+ transition: transform 0.2s ease, box-shadow 0.2s ease, border-color 0.2s ease;
96
+ display: flex;
97
+ flex-direction: column;
98
+ align-items: center;
99
+ justify-content: center;
100
+ min-height: 150px;
101
+ }
102
+
103
+ .tile:hover {
104
+ transform: translateY(-5px);
105
+ box-shadow: var(--tile-hover-shadow);
106
+ border-color: var(--primary-color);
107
+ }
108
+
109
+ .tile.active {
110
+ border-color: var(--primary-color);
111
+ box-shadow: var(--tile-hover-shadow);
112
+ background-color: #f0f7ff;
113
+ }
114
+
115
+ .tile .material-icons {
116
+ font-size: 3rem;
117
+ color: var(--primary-color);
118
+ margin-bottom: 1rem;
119
+ }
120
+
121
+ .tile h4 {
122
+ margin: 0;
123
+ font-size: 1.2rem;
124
+ color: var(--heading-color);
125
+ }
126
+
127
+ /* --- Content Panels --- */
128
+ .content-panel {
129
+ display: none; /* Hidden by default, shown by JS */
130
+ background-color: var(--card-bg-color);
131
+ border-radius: 8px;
132
+ box-shadow: var(--shadow);
133
+ padding: 2.5rem;
134
+ margin-top: 1rem;
135
+ }
136
+
137
+ .content-panel.active {
138
+ display: block;
139
+ }
140
+
141
+ .stack-layer {
142
+ margin-bottom: 2.5rem;
143
+ padding-bottom: 1.5rem;
144
+ border-bottom: 1px solid var(--border-color);
145
+ }
146
+
147
+ .stack-layer:last-child {
148
+ border-bottom: none;
149
+ margin-bottom: 0;
150
+ }
151
+
152
+ .stack-layer h3 {
153
+ font-size: 1.6rem;
154
+ color: var(--secondary-color);
155
+ margin-top: 0;
156
+ display: flex;
157
+ align-items: center;
158
+ }
159
+
160
+ .stack-layer h3 .material-icons {
161
+ margin-right: 12px;
162
+ font-size: 2rem;
163
+ }
164
+
165
+ /* --- Collapsible Sections & Code --- */
166
+ details {
167
+ border: 1px solid var(--border-color);
168
+ border-radius: 6px;
169
+ margin-bottom: 1rem;
170
+ background-color: #f9fafb;
171
+ transition: background-color 0.2s ease-in-out;
172
+ }
173
+
174
+ details[open] { background-color: var(--card-bg-color); }
175
+ summary {
176
+ cursor: pointer;
177
+ padding: 1rem;
178
+ font-weight: 500;
179
+ font-size: 1.1rem;
180
+ list-style: none;
181
+ display: flex;
182
+ align-items: center;
183
+ justify-content: space-between;
184
+ }
185
+ summary::-webkit-details-marker { display: none; }
186
+ summary::after {
187
+ font-family: 'Material Icons';
188
+ content: 'expand_more';
189
+ transform: rotate(0deg);
190
+ transition: transform 0.2s ease-in-out;
191
+ }
192
+ details[open] > summary::after { transform: rotate(180deg); }
193
+ .details-content { padding: 0 1rem 1rem 1rem; border-top: 1px solid var(--border-color); }
194
+
195
+ pre {
196
+ background-color: var(--code-bg-color);
197
+ color: var(--code-text-color);
198
+ padding: 1.5rem 1rem 1rem 1rem;
199
+ border-radius: 6px;
200
+ overflow-x: auto;
201
+ font-size: 0.9em;
202
+ position: relative;
203
+ }
204
+ code { font-family: 'Courier New', Courier, monospace; }
205
+ .code-block-header { font-weight: bold; color: var(--subtle-text-color); margin-bottom: -0.5rem; margin-top: 1rem; }
206
+ .copy-btn { position: absolute; top: 10px; right: 10px; background-color: #4a505c; color: #fff; border: none; padding: 6px 10px; border-radius: 4px; cursor: pointer; opacity: 0.7; transition: opacity 0.2s, background-color 0.2s; }
207
+ pre:hover .copy-btn { opacity: 1; }
208
+ .copy-btn:hover { background-color: #6c7382; }
209
+ .copy-btn.copied { background-color: var(--primary-dark); }
210
+ .icon-placeholder { font-style: italic; color: #999; display: inline-block; margin-left: 8px; }
211
+
212
+ </style>
213
+ </head>
214
+ <body>
215
+
216
+ <div class="container">
217
+ <header>
218
+ <h1>The MLOps Engineer's Cheatsheet for Model Serving</h1>
219
+ <p>Select a framework or model type to see a practical guide for serving it—from local code to a production-grade, auto-scaling Kubernetes deployment.</p>
220
+ </header>
221
+
222
+ <main>
223
+ <!-- ======================= Classic ML Tiles ======================= -->
224
+ <h2 class="main-section-title"><i class="material-icons">model_training</i>Classic Machine Learning</h2>
225
+ <div class="tile-container">
226
+ <div class="tile active" data-target="classic-pytorch">
227
+ <span class="material-icons">whatshot</span>
228
+ <h4><!-- ICON PLACEHOLDER: PyTorch -->PyTorch</h4>
229
+ </div>
230
+ <div class="tile" data-target="classic-tensorflow">
231
+ <span class="material-icons">hub</span>
232
+ <h4><!-- ICON PLACEHOLDER: TensorFlow -->TensorFlow</h4>
233
+ </div>
234
+ <div class="tile" data-target="classic-sklearn">
235
+ <span class="material-icons">data_object</span>
236
+ <h4><!-- ICON PLACEHOLDER: Scikit-learn -->Scikit-learn</h4>
237
+ </div>
238
+ <div class="tile" data-target="classic-xgboost">
239
+ <span class="material-icons">trending_up</span>
240
+ <h4><!-- ICON PLACEHOLDER: XGBoost -->XGBoost</h4>
241
+ </div>
242
+ <div class="tile" data-target="classic-jax">
243
+ <span class="material-icons">functions</span>
244
+ <h4><!-- ICON PLACEHOLDER: JAX -->JAX</h4>
245
+ </div>
246
+ </div>
247
+
248
+ <!-- ======================= Generative AI Tiles ======================= -->
249
+ <h2 class="main-section-title"><i class="material-icons">auto_awesome</i>Generative AI</h2>
250
+ <div class="tile-container">
251
+ <div class="tile" data-target="genai-llm">
252
+ <span class="material-icons">chat</span>
253
+ <h4>LLMs</h4>
254
+ </div>
255
+ <div class="tile" data-target="genai-vlm">
256
+ <span class="material-icons">image_search</span>
257
+ <h4>Multimodal (VLMs)</h4>
258
+ </div>
259
+ <div class="tile" data-target="genai-diffusion">
260
+ <span class="material-icons">palette</span>
261
+ <h4>Diffusion Models</h4>
262
+ </div>
263
+ </div>
264
+
265
+ <!-- ======================= Content Panels Container ======================= -->
266
+ <div class="content-container">
267
+
268
+ <!-- === PyTorch Content Panel === -->
269
+ <div id="classic-pytorch" class="content-panel active">
270
+ <div class="stack-layer">
271
+ <h3><i class="material-icons">psychology</i>Model Layer</h3>
272
+ <p>A simple feed-forward network defined in PyTorch. The model's `state_dict` is saved for deployment.</p>
273
+ <p class="code-block-header">model_setup.py</p>
274
+ <pre><code># model_setup.py
275
+ import torch
276
+ import torch.nn as nn
277
+
278
+ class SimpleNet(nn.Module):
279
+ def __init__(self):
280
+ super(SimpleNet, self).__init__()
281
+ self.linear = nn.Linear(10, 1)
282
+
283
+ def forward(self, x):
284
+ return self.linear(x)
285
+
286
+ model = SimpleNet()
287
+ torch.save(model.state_dict(), "pytorch_model.pth")
288
+ print("Model saved to pytorch_model.pth")</code></pre>
289
+ </div>
290
+ <div class="stack-layer">
291
+ <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
292
+ <p>Use a high-performance web framework like FastAPI for the API. For managed serving, KServe and Ray Serve offer powerful abstractions.</p>
293
+ <details>
294
+ <summary>Serve with FastAPI</summary>
295
+ <div class="details-content">
296
+ <pre><code># app.py
297
+ from fastapi import FastAPI
298
+ from pydantic import BaseModel
299
+ import torch
300
+ # ... (include SimpleNet class definition here) ...
301
+
302
+ app = FastAPI(title="PyTorch Model Server")
303
+ model = SimpleNet()
304
+ model.load_state_dict(torch.load("pytorch_model.pth"))
305
+ model.eval()
306
+
307
+ class PredReq(BaseModel): data: list[float]
308
+
309
+ @app.post("/predict")
310
+ def predict(req: PredReq):
311
+ tensor = torch.tensor([req.data], dtype=torch.float32)
312
+ with torch.no_grad():
313
+ pred = model(tensor)
314
+ return {"prediction": pred.item()}
315
+ </code></pre>
316
+ </div>
317
+ </details>
318
+ <details>
319
+ <summary>Serve with Ray Serve <!-- ICON PLACEHOLDER: Ray --></summary>
320
+ <div class="details-content">
321
+ <pre><code># ray_serve_app.py
322
+ from ray import serve
323
+ # ... (include FastAPI app, model class, etc.) ...
324
+
325
+ @serve.deployment
326
+ @serve.ingress(app)
327
+ class ModelServer:
328
+ def __init__(self):
329
+ self.model = SimpleNet()
330
+ self.model.load_state_dict(torch.load("pytorch_model.pth"))
331
+ self.model.eval()
332
+
333
+ # FastAPI handles routing, this class just holds the model
334
+ </code></pre>
335
+ </div>
336
+ </details>
337
+ <details>
338
+ <summary>Serve with KServe <!-- ICON PLACEHOLDER: Kubeflow --></summary>
339
+ <div class="details-content">
340
+ <pre><code># inferenceservice.yaml
341
+ apiVersion: "serving.kserve.io/v1beta1"
342
+ kind: "InferenceService"
343
+ metadata:
344
+ name: "pytorch-model"
345
+ spec:
346
+ predictor:
347
+ pytorch:
348
+ storageUri: "pvc://your-pvc/path/to/model-dir"
349
+ </code></pre>
350
+ </div>
351
+ </details>
352
+ </div>
353
+ <div class="stack-layer">
354
+ <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
355
+ <p>Package the application with a multi-stage Dockerfile and define its runtime with Kubernetes Deployment, Service, and HPA objects.</p>
356
+ <details>
357
+ <summary>Dockerfile</summary>
358
+ <div class="details-content">
359
+ <pre><code># Multi-stage build for a lean final image
360
+ FROM python:3.9-slim as builder
361
+ WORKDIR /install
362
+ RUN pip install --no-cache-dir --prefix="/install" torch fastapi "uvicorn[standard]"
363
+
364
+ FROM python:3.9-slim
365
+ WORKDIR /app
366
+ COPY --from=builder /install /usr/local
367
+ COPY ./app.py /app/
368
+ COPY ./pytorch_model.pth /app/
369
+ EXPOSE 8000
370
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
371
+ </code></pre>
372
+ </div>
373
+ </details>
374
+ <details>
375
+ <summary>Deployment & Service YAML <!-- ICON PLACEHOLDER: Kubernetes --></summary>
376
+ <div class="details-content">
377
+ <pre><code># deployment.yaml
378
+ apiVersion: apps/v1
379
+ kind: Deployment
380
+ metadata:
381
+ name: pytorch-deployment
382
+ spec:
383
+ replicas: 2
384
+ selector: { matchLabels: { app: pytorch } }
385
+ template:
386
+ metadata: { labels: { app: pytorch } }
387
+ spec:
388
+ containers:
389
+ - name: server
390
+ image: your-repo/pytorch-server:latest
391
+ ports:
392
+ - containerPort: 8000
393
+ ---
394
+ # service.yaml
395
+ apiVersion: v1
396
+ kind: Service
397
+ metadata:
398
+ name: pytorch-service
399
+ spec:
400
+ type: NodePort
401
+ selector: { app: pytorch }
402
+ ports:
403
+ - port: 80
404
+ targetPort: 8000
405
+ </code></pre>
406
+ </div>
407
+ </details>
408
+ <details>
409
+ <summary>Autoscaling (HPA) YAML</summary>
410
+ <div class="details-content">
411
+ <pre><code>apiVersion: autoscaling/v2
412
+ kind: HorizontalPodAutoscaler
413
+ metadata:
414
+ name: pytorch-hpa
415
+ spec:
416
+ scaleTargetRef:
417
+ apiVersion: apps/v1
418
+ kind: Deployment
419
+ name: pytorch-deployment
420
+ minReplicas: 1
421
+ maxReplicas: 5
422
+ metrics:
423
+ - type: Resource
424
+ resource: { name: cpu, target: { type: Utilization, averageUtilization: 80 } }
425
+ </code></pre>
426
+ </div>
427
+ </details>
428
+ </div>
429
+ <div class="stack-layer">
430
+ <h3><i class="material-icons">memory</i>Hardware Layer</h3>
431
+ <p><strong>CPUs:</strong> Suitable for small networks or where latency is not critical.<br><strong>GPUs:</strong> Essential for deep learning models to achieve low-latency inference. Use NVIDIA GPUs (T4, A10G, A100) for best performance with CUDA.<br><strong>TPUs:</strong> Best for massive-scale inference on Google Cloud, especially for models trained on TPUs.</p>
432
+ </div>
433
+ </div>
434
+
435
+ <!-- === TensorFlow Content Panel === -->
436
+ <div id="classic-tensorflow" class="content-panel">
437
+ <!-- Content for TensorFlow follows the same 4-layer structure -->
438
+ <div class="stack-layer">
439
+ <h3><i class="material-icons">psychology</i>Model Layer</h3>
440
+ <p>A simple Keras model saved in TensorFlow's `SavedModel` format, which bundles the architecture and weights.</p>
441
+ <p class="code-block-header">model_setup.py</p>
442
+ <pre><code># model_setup.py
443
+ import tensorflow as tf
444
+ model = tf.keras.Sequential([
445
+ tf.keras.layers.Dense(10, activation='relu', input_shape=(10,)),
446
+ tf.keras.layers.Dense(1)
447
+ ])
448
+ model.save("tf_saved_model")
449
+ print("Model saved to tf_saved_model/")</code></pre>
450
+ </div>
451
+ <div class="stack-layer">
452
+ <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
453
+ <p>FastAPI is a great choice for a custom Python server. KServe has native, high-performance support for the `SavedModel` format.</p>
454
+ <details>
455
+ <summary>Serve with FastAPI</summary>
456
+ <div class="details-content">
457
+ <pre><code># app.py
458
+ from fastapi import FastAPI
459
+ from pydantic import BaseModel
460
+ import tensorflow as tf
461
+ import numpy as np
462
+
463
+ app = FastAPI(title="TensorFlow Model Server")
464
+ model = tf.keras.models.load_model("tf_saved_model")
465
+
466
+ class PredReq(BaseModel): data: list[float]
467
+
468
+ @app.post("/predict")
469
+ def predict(req: PredReq):
470
+ pred = model.predict(np.array([req.data]))
471
+ return {"prediction": pred.flatten().tolist()}
472
+ </code></pre>
473
+ </div>
474
+ </details>
475
+ <details>
476
+ <summary>Serve with KServe <!-- ICON PLACEHOLDER: Kubeflow --></summary>
477
+ <div class="details-content">
478
+ <pre><code># inferenceservice.yaml
479
+ apiVersion: "serving.kserve.io/v1beta1"
480
+ kind: "InferenceService"
481
+ metadata:
482
+ name: "tensorflow-model"
483
+ spec:
484
+ predictor:
485
+ tensorflow:
486
+ storageUri: "s3://my-bucket/path/to/tf_saved_model"
487
+ </code></pre>
488
+ </div>
489
+ </details>
490
+ </div>
491
+ <div class="stack-layer">
492
+ <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
493
+ <p>The Kubernetes configuration is very similar to the PyTorch example. Ensure your Dockerfile copies the entire `tf_saved_model` directory and installs the `tensorflow` library.</p>
494
+ </div>
495
+ <div class="stack-layer">
496
+ <h3><i class="material-icons">memory</i>Hardware Layer</h3>
497
+ <p><strong>CPUs:</strong> Good for smaller Keras models. <br><strong>GPUs:</strong> Highly recommended for deep learning models. TensorFlow has excellent CUDA integration. <br><strong>TPUs:</strong> The premier choice for running TensorFlow models at scale, offering the best price/performance on GCP.</p>
498
+ </div>
499
+ </div>
500
+
501
+ <!-- === Scikit-learn Content Panel === -->
502
+ <div id="classic-sklearn" class="content-panel">
503
+ <div class="stack-layer">
504
+ <h3><i class="material-icons">psychology</i>Model Layer</h3>
505
+ <p>A classic logistic regression model. Serialization is typically done with `joblib` for efficiency with NumPy structures.</p>
506
+ <p class="code-block-header">model_setup.py</p>
507
+ <pre><code># model_setup.py
508
+ import joblib
509
+ from sklearn.linear_model import LogisticRegression
510
+ from sklearn.datasets import make_classification
511
+
512
+ X, y = make_classification(n_features=4)
513
+ model = LogisticRegression().fit(X, y)
514
+ joblib.dump(model, "sklearn_model.joblib")
515
+ print("Model saved to sklearn_model.joblib")</code></pre>
516
+ </div>
517
+ <div class="stack-layer">
518
+ <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
519
+ <p>FastAPI provides a simple and fast web server. KServe and Ray Serve also have native support for scikit-learn models.</p>
520
+ <details>
521
+ <summary>Serve with FastAPI</summary>
522
+ <div class="details-content">
523
+ <pre><code># app.py
524
+ from fastapi import FastAPI
525
+ from pydantic import BaseModel
526
+ import joblib, numpy as np
527
+
528
+ app = FastAPI(title="Scikit-learn Server")
529
+ model = joblib.load("sklearn_model.joblib")
530
+
531
+ class PredReq(BaseModel): data: list[list[float]]
532
+
533
+ @app.post("/predict")
534
+ def predict(req: PredReq):
535
+ pred = model.predict(np.array(req.data))
536
+ return {"prediction": pred.tolist()}
537
+ </code></pre>
538
+ </div>
539
+ </details>
540
+ <details>
541
+ <summary>Serve with KServe <!-- ICON PLACEHOLDER: Kubeflow --></summary>
542
+ <div class="details-content">
543
+ <pre><code># inferenceservice.yaml
544
+ apiVersion: "serving.kserve.io/v1beta1"
545
+ kind: "InferenceService"
546
+ metadata:
547
+ name: "sklearn-model"
548
+ spec:
549
+ predictor:
550
+ sklearn:
551
+ storageUri: "pvc://my-pvc/path/to/model-dir" # must contain model.joblib
552
+ </code></pre>
553
+ </div>
554
+ </details>
555
+ </div>
556
+ <div class="stack-layer">
557
+ <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
558
+ <p>Standard Kubernetes setup. The Docker container will be lightweight as it only needs `scikit-learn`, `joblib`, and `fastapi`.</p>
559
+ </div>
560
+ <div class="stack-layer">
561
+ <h3><i class="material-icons">memory</i>Hardware Layer</h3>
562
+ <p><strong>CPUs:</strong> Almost always sufficient. Scikit-learn models are designed to run efficiently on CPUs.<br><strong>GPUs/TPUs:</strong> Not used. There is no GPU acceleration for standard scikit-learn algorithms.</p>
563
+ </div>
564
+ </div>
565
+
566
+ <!-- === XGBoost Content Panel === -->
567
+ <div id="classic-xgboost" class="content-panel">
568
+ <div class="stack-layer">
569
+ <h3><i class="material-icons">psychology</i>Model Layer</h3>
570
+ <p>An XGBoost model saved in its native JSON format, which is portable and human-readable.</p>
571
+ <p class="code-block-header">model_setup.py</p>
572
+ <pre><code># model_setup.py
573
+ import xgboost as xgb
574
+ from sklearn.datasets import make_classification
575
+
576
+ X, y = make_classification(n_features=4)
577
+ dtrain = xgb.DMatrix(X, label=y)
578
+ model = xgb.train({'objective':'binary:logistic'}, dtrain, 10)
579
+ model.save_model("xgboost_model.json")
580
+ print("Model saved to xgboost_model.json")</code></pre>
581
+ </div>
582
+ <div class="stack-layer">
583
+ <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
584
+ <p>KServe and Ray Serve both support XGBoost. A custom FastAPI server is also a robust option.</p>
585
+ <details>
586
+ <summary>Serve with FastAPI</summary>
587
+ <div class="details-content">
588
+ <pre><code># app.py
589
+ from fastapi import FastAPI
590
+ from pydantic import BaseModel
591
+ import xgboost as xgb, numpy as np
592
+
593
+ app = FastAPI(title="XGBoost Server")
594
+ model = xgb.Booster()
595
+ model.load_model("xgboost_model.json")
596
+
597
+ class PredReq(BaseModel): data: list[list[float]]
598
+
599
+ @app.post("/predict")
600
+ def predict(req: PredReq):
601
+ dmatrix = xgb.DMatrix(np.array(req.data))
602
+ pred = model.predict(dmatrix)
603
+ return {"prediction": pred.tolist()}
604
+ </code></pre>
605
+ </div>
606
+ </details>
607
+ </div>
608
+ <div class="stack-layer">
609
+ <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
610
+ <p>Standard Kubernetes setup. The Dockerfile should include the `xgboost` library.</p>
611
+ </div>
612
+ <div class="stack-layer">
613
+ <h3><i class="material-icons">memory</i>Hardware Layer</h3>
614
+ <p><strong>CPUs:</strong> Excellent performance for most use cases.<br><strong>GPUs:</strong> XGBoost has optional GPU acceleration which can provide a significant speedup for large datasets and complex trees during inference.</p>
615
+ </div>
616
+ </div>
617
+
618
+ <!-- === JAX Content Panel === -->
619
+ <div id="classic-jax" class="content-panel">
620
+ <div class="stack-layer">
621
+ <h3><i class="material-icons">psychology</i>Model Layer</h3>
622
+ <p>JAX models are often defined as pure functions with parameters handled separately. We save the parameters using NumPy.</p>
623
+ <p class="code-block-header">model_setup.py</p>
624
+ <pre><code># model_setup.py
625
+ import jax
626
+ import jax.numpy as jnp
627
+ import numpy as np
628
+
629
+ # A pure function for linear regression
630
+ def predict_fn(params, inputs):
631
+ return jnp.dot(inputs, params['w']) + params['b']
632
+
633
+ # Initialize and save dummy parameters
634
+ key = jax.random.PRNGKey(0)
635
+ params = {
636
+ 'w': jax.random.normal(key, (10,)),
637
+ 'b': jnp.array(0.0)
638
+ }
639
+ np.savez("jax_params.npz", **params)
640
+ print("Parameters saved to jax_params.npz")</code></pre>
641
+ </div>
642
+ <div class="stack-layer">
643
+ <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
644
+ <p>Ray Serve is an excellent fit for JAX's functional paradigm. A custom FastAPI server is also straightforward. KServe requires a custom container.</p>
645
+ <details>
646
+ <summary>Serve with FastAPI</summary>
647
+ <div class="details-content">
648
+ <pre><code># app.py
649
+ from fastapi import FastAPI
650
+ from pydantic import BaseModel
651
+ import jax, jax.numpy as jnp, numpy as np
652
+
653
+ # Define predict function and JIT-compile it
654
+ @jax.jit
655
+ def predict_fn(params, inputs):
656
+ return jnp.dot(inputs, params['w']) + params['b']
657
+
658
+ app = FastAPI(title="JAX Server")
659
+ params = np.load("jax_params.npz")
660
+
661
+ class PredReq(BaseModel): data: list[float]
662
+
663
+ @app.post("/predict")
664
+ def predict(req: PredReq):
665
+ pred = predict_fn(params, jnp.array(req.data))
666
+ return {"prediction": pred.tolist()}
667
+ </code></pre>
668
+ </div>
669
+ </details>
670
+ </div>
671
+ <div class="stack-layer">
672
+ <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
673
+ <p>The Kubernetes configuration is standard. The Dockerfile needs to install `jax` and `jaxlib` corresponding to the target hardware (CPU or GPU).</p>
674
+ </div>
675
+ <div class="stack-layer">
676
+ <h3><i class="material-icons">memory</i>Hardware Layer</h3>
677
+ <p><strong>CPUs:</strong> JAX is very fast on CPU.<br><strong>GPUs/TPUs:</strong> JAX was designed for accelerators and excels on GPUs and TPUs, often outperforming other frameworks due to its XLA-based compilation.</p>
678
+ </div>
679
+ </div>
680
+
681
+ <!-- === LLM Content Panel === -->
682
+ <div id="genai-llm" class="content-panel">
683
+ <div class="stack-layer">
684
+ <h3><i class="material-icons">psychology</i>Model Layer</h3>
685
+ <p>Large Language Models (e.g., Llama, Mistral) are based on the Transformer architecture. The key inference challenge is managing the <strong>KV Cache</strong>, a stateful cache of attention keys and values that grows with every generated token and consumes massive amounts of VRAM.</p>
686
+ </div>
687
+ <div class="stack-layer">
688
+ <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
689
+ <p>Specialized serving toolkits are required for efficient LLM inference. These handle complex optimizations like continuous batching and KV cache management.</p>
690
+ <ul>
691
+ <li><strong>vLLM:</strong> A high-throughput serving engine using PagedAttention to optimize KV cache memory, drastically improving throughput.</li>
692
+ <li><strong>Text Generation Inference (TGI):</strong> Hugging Face's production-ready solution with tensor parallelism and optimized kernels.</li>
693
+ <li><strong>TensorRT-LLM:</strong> NVIDIA's library for compiling LLMs into highly optimized engines for NVIDIA GPUs.</li>
694
+ </ul>
695
+ </div>
696
+ <div class="stack-layer">
697
+ <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
698
+ <p>Deployments must request large amounts of GPU resources (`nvidia.com/gpu: 1`) and memory. Node affinity and taints/tolerations are used to schedule pods onto specific GPU node pools (e.g., nodes with A100s).</p>
699
+ </div>
700
+ <div class="stack-layer">
701
+ <h3><i class="material-icons">memory</i>Hardware Layer</h3>
702
+ <p><strong>GPUs:</strong> Essential. High-VRAM GPUs like NVIDIA A100 (40GB/80GB) or H100 (80GB) are required to fit the model weights and KV cache. Multiple GPUs are often needed for larger models via tensor parallelism.</p>
703
+ </div>
704
+ </div>
705
+
706
+ <!-- === VLM Content Panel === -->
707
+ <div id="genai-vlm" class="content-panel">
708
+ <div class="stack-layer">
709
+ <h3><i class="material-icons">psychology</i>Model Layer</h3>
710
+ <p>Visual Large Models (e.g., LLaVA, CogVLM) combine a vision encoder (like ViT) with an LLM. They can process and reason about both images and text, making them powerful but complex to serve.</p>
711
+ </div>
712
+ <div class="stack-layer">
713
+ <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
714
+ <p>The serving stack must handle multi-modal inputs (e.g., base64-encoded images and text in a single JSON payload). Preprocessing the image into tensors is a key part of the serving logic. Frameworks like <strong>vLLM</strong> and <strong>SGLang</strong> are adding support for VLMs.</p>
715
+ </div>
716
+ <div class="stack-layer">
717
+ <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
718
+ <p>Similar to LLMs, VLM deployments require significant GPU and memory resources. The API server (e.g., FastAPI) must be configured to accept large request bodies to accommodate image data.</p>
719
+ </div>
720
+ <div class="stack-layer">
721
+ <h3><i class="material-icons">memory</i>Hardware Layer</h3>
722
+ <p><strong>GPUs:</strong> High-VRAM GPUs are mandatory. The VRAM must accommodate the vision encoder, the LLM, and the KV cache, making memory requirements even higher than for a text-only LLM of a similar size.</p>
723
+ </div>
724
+ </div>
725
+
726
+ <!-- === Diffusion Content Panel === -->
727
+ <div id="genai-diffusion" class="content-panel">
728
+ <div class="stack-layer">
729
+ <h3><i class="material-icons">psychology</i>Model Layer</h3>
730
+ <p>Diffusion models (e.g., Stable Diffusion) generate images through an iterative denoising process. Each step is a full forward pass through a large UNet model, making inference latency a major challenge.</p>
731
+ </div>
732
+ <div class="stack-layer">
733
+ <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
734
+ <p>Optimizing the serving stack focuses on reducing the number of inference steps and speeding up each step.</p>
735
+ <ul>
736
+ <li><strong>Model Compilation:</strong> Use tools like <strong>TensorRT</strong> or `torch.compile` to optimize the UNet and VAE components for the target GPU.</li>
737
+ <li><strong>Latent Consistency Models (LCMs):</strong> A powerful distillation technique that allows for high-quality image generation in just 2-8 steps, drastically cutting latency.</li>
738
+ <li><strong>Custom Pipelines:</strong> Tools like <strong>ComfyUI</strong> or <strong>Diffusers</strong> provide flexible pipelines that can be wrapped in a serving framework like FastAPI or Ray Serve.</li>
739
+ </ul>
740
+ </div>
741
+ <div class="stack-layer">
742
+ <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
743
+ <p>Deployments must be stateful if caching compiled models or dealing with user-specific LoRAs. Persistent Volumes (PVCs) can be used to store these assets. Resource requests for GPU and VRAM are critical.</p>
744
+ </div>
745
+ <div class="stack-layer">
746
+ <h3><i class="material-icons">memory</i>Hardware Layer</h3>
747
+ <p><strong>GPUs:</strong> High-end consumer (e.g., RTX 4090) or datacenter GPUs (A10G, A100) are needed for acceptable generation speeds. VRAM is the most critical resource, as it limits the output resolution and batch size.</p>
748
+ </div>
749
+ </div>
750
+ </div>
751
+
752
+ <!-- ======================= Generic ML Optimization Section ======================= -->
753
+ <div id="optimizations" style="margin-top: 3rem;">
754
+ <h2 class="main-section-title"><i class="material-icons">speed</i>Generic ML Optimization</h2>
755
+ <div class="content-panel active">
756
+ <div class="stack-layer">
757
+ <h3><i class="material-icons">dns</i>Optimize the Cluster</h3>
758
+ <p>Tune the foundation for performance and cost.</p>
759
+ <ul>
760
+ <li><strong>Node Tuning:</strong> Use appropriate machine types (e.g., GPU nodes for DL, compute-optimized for CPU-bound tasks).</li>
761
+ <li><strong>Cluster Autoscaling:</strong> Automatically add/remove nodes based on demand to save costs.</li>
762
+ <li><strong>Network Policies:</strong> Secure inter-service communication within the cluster.</li>
763
+ </ul>
764
+ </div>
765
+ <div class="stack-layer">
766
+ <h3><i class="material-icons">web</i>Optimize the Container & Server</h3>
767
+ <p>Make the serving application itself as efficient as possible.</p>
768
+ <ul>
769
+ <li><strong>Efficient Web Server:</strong> Use ASGI servers (Uvicorn, Hypercorn) with FastAPI over WSGI (Flask) for better async performance.</li>
770
+ <li><strong>Dynamic Batching:</strong> Group incoming requests into a single batch to maximize hardware utilization, especially on GPUs.</li>
771
+ <li><strong>Lean Containers:</strong> Use multi-stage Docker builds to create small, secure production images.</li>
772
+ </ul>
773
+ </div>
774
+ <div class="stack-layer">
775
+ <h3><i class="material-icons">compress</i>Optimize the Model</h3>
776
+ <p>Reduce model size and increase inference speed.</p>
777
+ <ul>
778
+ <li><strong>Quantization:</strong> Reduce model precision (e.g., FP32 to INT8/FP8) to shrink size and accelerate inference.</li>
779
+ <li><strong>Pruning:</strong> Remove unnecessary weights from the model to create a smaller, faster "sparse" version.</li>
780
+ <li><strong>Compilation:</strong> Use tools like TensorRT, OpenVINO, or JAX's JIT to compile the model into highly optimized, hardware-specific code.</li>
781
+ </ul>
782
+ </div>
783
+ </div>
784
+ </div>
785
+ </main>
786
+ </div>
787
+
788
+ <script>
789
document.addEventListener('DOMContentLoaded', function () {
  const tiles = document.querySelectorAll('.tile');
  const contentPanels = document.querySelectorAll('.content-panel');

  /**
   * Activate the content panel whose id matches the clicked tile's
   * `data-target`, and mark that tile as the active one.
   * Panels living inside the always-visible `#optimizations` section are
   * deliberately never hidden.
   * @param {MouseEvent} event - click event fired on a `.tile` element
   */
  function switchPanel(event) {
    const targetId = event.currentTarget.dataset.target;

    // Highlight only the clicked tile.
    tiles.forEach((tile) => tile.classList.remove('active'));
    event.currentTarget.classList.add('active');

    // Show the matching panel; hide the rest of the tile-controlled panels.
    contentPanels.forEach((panel) => {
      if (panel.id === targetId) {
        panel.classList.add('active');
      } else if (panel.parentElement.id !== 'optimizations') {
        // A missing id is already !== 'optimizations', so a single strict
        // comparison suffices (the original `!id || id !== ...` was redundant).
        panel.classList.remove('active');
      }
    });
  }

  tiles.forEach((tile) => tile.addEventListener('click', switchPanel));

  // Attach a "Copy" button to every <pre> that wraps a <code> block.
  document.querySelectorAll('pre').forEach((pre) => {
    const code = pre.querySelector('code');
    if (!code) return; // bare <pre> without code: nothing to copy

    const copyButton = document.createElement('button');
    copyButton.innerText = 'Copy';
    copyButton.className = 'copy-btn';

    copyButton.addEventListener('click', (e) => {
      e.stopPropagation(); // Prevent details/summary from toggling
      navigator.clipboard.writeText(code.innerText).then(() => {
        // Brief visual confirmation, then restore the original label.
        copyButton.innerText = 'Copied!';
        copyButton.classList.add('copied');
        setTimeout(() => {
          copyButton.innerText = 'Copy';
          copyButton.classList.remove('copied');
        }, 2000);
      }).catch((err) => {
        console.error('Failed to copy text: ', err);
      });
    });

    pre.appendChild(copyButton);
  });
});
848
+ </script>
849
+ </body>
850
+ </html>