fredmo committed on
Commit
13e6b23
·
verified ·
1 Parent(s): 3c77c49

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +850 -19
index.html CHANGED
@@ -1,19 +1,850 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
19
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>The MLOps Engineer's Cheatsheet for Model Serving</title>
7
+ <link rel="preconnect" href="https://fonts.googleapis.com">
8
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
9
+ <link href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" rel="stylesheet">
10
+ <link href="https://fonts.googleapis.com/icon?family=Material+Icons" rel="stylesheet">
11
+ <style>
12
+ /* --- General Setup & Variables --- */
13
+ :root {
14
+ --primary-color: #1E88E5; /* Blue */
15
+ --primary-dark: #1565C0;
16
+ --secondary-color: #004d40; /* Dark Teal for contrast */
17
+ --background-color: #f4f6f8;
18
+ --card-bg-color: #ffffff;
19
+ --text-color: #333;
20
+ --heading-color: #212121;
21
+ --subtle-text-color: #555;
22
+ --border-color: #e0e0e0;
23
+ --code-bg-color: #282c34;
24
+ --code-text-color: #abb2bf;
25
+ --shadow: 0 4px 12px rgba(0,0,0,0.1);
26
+ --tile-hover-shadow: 0 6px 16px rgba(0,0,0,0.15);
27
+ }
28
+
29
+ body {
30
+ font-family: 'Roboto', sans-serif;
31
+ background-color: var(--background-color);
32
+ color: var(--text-color);
33
+ margin: 0;
34
+ padding: 0;
35
+ line-height: 1.6;
36
+ }
37
+
38
+ /* --- Layout & Containers --- */
39
+ .container {
40
+ max-width: 1200px;
41
+ margin: 0 auto;
42
+ padding: 2rem;
43
+ }
44
+
45
+ header {
46
+ text-align: center;
47
+ margin-bottom: 2rem;
48
+ }
49
+
50
+ header h1 {
51
+ color: var(--heading-color);
52
+ font-weight: 700;
53
+ font-size: 2.8rem;
54
+ margin-bottom: 0.5rem;
55
+ }
56
+
57
+ header p {
58
+ font-size: 1.1rem;
59
+ color: var(--subtle-text-color);
60
+ max-width: 800px;
61
+ margin: 0 auto;
62
+ }
63
+
64
+ .main-section-title {
65
+ font-size: 2.2rem;
66
+ color: var(--heading-color);
67
+ border-bottom: 3px solid var(--primary-color);
68
+ padding-bottom: 0.75rem;
69
+ margin-top: 3rem;
70
+ margin-bottom: 2rem;
71
+ display: flex;
72
+ align-items: center;
73
+ }
74
+
75
+ .main-section-title .material-icons {
76
+ font-size: 2.8rem;
77
+ margin-right: 1rem;
78
+ }
79
+
80
+ /* --- Tile Navigation --- */
81
+ .tile-container {
82
+ display: grid;
83
+ grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
84
+ gap: 1.5rem;
85
+ margin-bottom: 2.5rem;
86
+ }
87
+
88
+ .tile {
89
+ background-color: var(--card-bg-color);
90
+ border: 2px solid var(--border-color);
91
+ border-radius: 8px;
92
+ padding: 1.5rem;
93
+ text-align: center;
94
+ cursor: pointer;
95
+ transition: transform 0.2s ease, box-shadow 0.2s ease, border-color 0.2s ease;
96
+ display: flex;
97
+ flex-direction: column;
98
+ align-items: center;
99
+ justify-content: center;
100
+ min-height: 150px;
101
+ }
102
+
103
+ .tile:hover {
104
+ transform: translateY(-5px);
105
+ box-shadow: var(--tile-hover-shadow);
106
+ border-color: var(--primary-color);
107
+ }
108
+
109
+ .tile.active {
110
+ border-color: var(--primary-color);
111
+ box-shadow: var(--tile-hover-shadow);
112
+ background-color: #f0f7ff;
113
+ }
114
+
115
+ .tile .material-icons {
116
+ font-size: 3rem;
117
+ color: var(--primary-color);
118
+ margin-bottom: 1rem;
119
+ }
120
+
121
+ .tile h4 {
122
+ margin: 0;
123
+ font-size: 1.2rem;
124
+ color: var(--heading-color);
125
+ }
126
+
127
+ /* --- Content Panels --- */
128
+ .content-panel {
129
+ display: none; /* Hidden by default, shown by JS */
130
+ background-color: var(--card-bg-color);
131
+ border-radius: 8px;
132
+ box-shadow: var(--shadow);
133
+ padding: 2.5rem;
134
+ margin-top: 1rem;
135
+ }
136
+
137
+ .content-panel.active {
138
+ display: block;
139
+ }
140
+
141
+ .stack-layer {
142
+ margin-bottom: 2.5rem;
143
+ padding-bottom: 1.5rem;
144
+ border-bottom: 1px solid var(--border-color);
145
+ }
146
+
147
+ .stack-layer:last-child {
148
+ border-bottom: none;
149
+ margin-bottom: 0;
150
+ }
151
+
152
+ .stack-layer h3 {
153
+ font-size: 1.6rem;
154
+ color: var(--secondary-color);
155
+ margin-top: 0;
156
+ display: flex;
157
+ align-items: center;
158
+ }
159
+
160
+ .stack-layer h3 .material-icons {
161
+ margin-right: 12px;
162
+ font-size: 2rem;
163
+ }
164
+
165
+ /* --- Collapsible Sections & Code --- */
166
+ details {
167
+ border: 1px solid var(--border-color);
168
+ border-radius: 6px;
169
+ margin-bottom: 1rem;
170
+ background-color: #f9fafb;
171
+ transition: background-color 0.2s ease-in-out;
172
+ }
173
+
174
+ details[open] { background-color: var(--card-bg-color); }
175
+ summary {
176
+ cursor: pointer;
177
+ padding: 1rem;
178
+ font-weight: 500;
179
+ font-size: 1.1rem;
180
+ list-style: none;
181
+ display: flex;
182
+ align-items: center;
183
+ justify-content: space-between;
184
+ }
185
+ summary::-webkit-details-marker { display: none; }
186
+ summary::after {
187
+ font-family: 'Material Icons';
188
+ content: 'expand_more';
189
+ transform: rotate(0deg);
190
+ transition: transform 0.2s ease-in-out;
191
+ }
192
+ details[open] > summary::after { transform: rotate(180deg); }
193
+ .details-content { padding: 0 1rem 1rem 1rem; border-top: 1px solid var(--border-color); }
194
+
195
+ pre {
196
+ background-color: var(--code-bg-color);
197
+ color: var(--code-text-color);
198
+ padding: 1.5rem 1rem 1rem 1rem;
199
+ border-radius: 6px;
200
+ overflow-x: auto;
201
+ font-size: 0.9em;
202
+ position: relative;
203
+ }
204
+ code { font-family: 'Courier New', Courier, monospace; }
205
+ .code-block-header { font-weight: bold; color: var(--subtle-text-color); margin-bottom: -0.5rem; margin-top: 1rem; }
206
+ .copy-btn { position: absolute; top: 10px; right: 10px; background-color: #4a505c; color: #fff; border: none; padding: 6px 10px; border-radius: 4px; cursor: pointer; opacity: 0.7; transition: opacity 0.2s, background-color 0.2s; }
207
+ pre:hover .copy-btn { opacity: 1; }
208
+ .copy-btn:hover { background-color: #6c7382; }
209
+ .copy-btn.copied { background-color: var(--primary-dark); }
210
+ .icon-placeholder { font-style: italic; color: #999; display: inline-block; margin-left: 8px; }
211
+
212
+ </style>
213
+ </head>
214
+ <body>
215
+
216
+ <div class="container">
217
+ <header>
218
+ <h1>The MLOps Engineer's Cheatsheet for Model Serving</h1>
219
+ <p>Select a framework or model type to see a practical guide for serving it—from local code to a production-grade, auto-scaling Kubernetes deployment.</p>
220
+ </header>
221
+
222
+ <main>
223
+ <!-- ======================= Classic ML Tiles ======================= -->
224
+ <h2 class="main-section-title"><i class="material-icons">model_training</i>Classic Machine Learning</h2>
225
+ <div class="tile-container">
226
+ <div class="tile active" data-target="classic-pytorch">
227
+ <span class="material-icons">whatshot</span>
228
+ <h4><!-- ICON PLACEHOLDER: PyTorch -->PyTorch</h4>
229
+ </div>
230
+ <div class="tile" data-target="classic-tensorflow">
231
+ <span class="material-icons">hub</span>
232
+ <h4><!-- ICON PLACEHOLDER: TensorFlow -->TensorFlow</h4>
233
+ </div>
234
+ <div class="tile" data-target="classic-sklearn">
235
+ <span class="material-icons">data_object</span>
236
+ <h4><!-- ICON PLACEHOLDER: Scikit-learn -->Scikit-learn</h4>
237
+ </div>
238
+ <div class="tile" data-target="classic-xgboost">
239
+ <span class="material-icons">trending_up</span>
240
+ <h4><!-- ICON PLACEHOLDER: XGBoost -->XGBoost</h4>
241
+ </div>
242
+ <div class="tile" data-target="classic-jax">
243
+ <span class="material-icons">functions</span>
244
+ <h4><!-- ICON PLACEHOLDER: JAX -->JAX</h4>
245
+ </div>
246
+ </div>
247
+
248
+ <!-- ======================= Generative AI Tiles ======================= -->
249
+ <h2 class="main-section-title"><i class="material-icons">auto_awesome</i>Generative AI</h2>
250
+ <div class="tile-container">
251
+ <div class="tile" data-target="genai-llm">
252
+ <span class="material-icons">chat</span>
253
+ <h4>LLMs</h4>
254
+ </div>
255
+ <div class="tile" data-target="genai-vlm">
256
+ <span class="material-icons">image_search</span>
257
+ <h4>Multimodal (VLMs)</h4>
258
+ </div>
259
+ <div class="tile" data-target="genai-diffusion">
260
+ <span class="material-icons">palette</span>
261
+ <h4>Diffusion Models</h4>
262
+ </div>
263
+ </div>
264
+
265
+ <!-- ======================= Content Panels Container ======================= -->
266
+ <div class="content-container">
267
+
268
+ <!-- === PyTorch Content Panel === -->
269
+ <div id="classic-pytorch" class="content-panel active">
270
+ <div class="stack-layer">
271
+ <h3><i class="material-icons">psychology</i>Model Layer</h3>
272
+ <p>A simple feed-forward network defined in PyTorch. The model's `state_dict` is saved for deployment.</p>
273
+ <p class="code-block-header">model_setup.py</p>
274
+ <pre><code># model_setup.py
275
+ import torch
276
+ import torch.nn as nn
277
+
278
+ class SimpleNet(nn.Module):
279
+ def __init__(self):
280
+ super(SimpleNet, self).__init__()
281
+ self.linear = nn.Linear(10, 1)
282
+
283
+ def forward(self, x):
284
+ return self.linear(x)
285
+
286
+ model = SimpleNet()
287
+ torch.save(model.state_dict(), "pytorch_model.pth")
288
+ print("Model saved to pytorch_model.pth")</code></pre>
289
+ </div>
290
+ <div class="stack-layer">
291
+ <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
292
+ <p>Use a high-performance web framework like FastAPI for the API. For managed serving, KServe and Ray Serve offer powerful abstractions.</p>
293
+ <details>
294
+ <summary>Serve with FastAPI</summary>
295
+ <div class="details-content">
296
+ <pre><code># app.py
297
+ from fastapi import FastAPI
298
+ from pydantic import BaseModel
299
+ import torch
300
+ # ... (include SimpleNet class definition here) ...
301
+
302
+ app = FastAPI(title="PyTorch Model Server")
303
+ model = SimpleNet()
304
+ model.load_state_dict(torch.load("pytorch_model.pth"))
305
+ model.eval()
306
+
307
+ class PredReq(BaseModel): data: list[float]
308
+
309
+ @app.post("/predict")
310
+ def predict(req: PredReq):
311
+ tensor = torch.tensor([req.data], dtype=torch.float32)
312
+ with torch.no_grad():
313
+ pred = model(tensor)
314
+ return {"prediction": pred.item()}
315
+ </code></pre>
316
+ </div>
317
+ </details>
318
+ <details>
319
+ <summary>Serve with Ray Serve <!-- ICON PLACEHOLDER: Ray --></summary>
320
+ <div class="details-content">
321
+ <pre><code># ray_serve_app.py
322
+ from ray import serve
323
+ # ... (include FastAPI app, model class, etc.) ...
324
+
325
+ @serve.deployment
326
+ @serve.ingress(app)
327
+ class ModelServer:
328
+ def __init__(self):
329
+ self.model = SimpleNet()
330
+ self.model.load_state_dict(torch.load("pytorch_model.pth"))
331
+ self.model.eval()
332
+
333
+ # FastAPI handles routing, this class just holds the model
334
+ </code></pre>
335
+ </div>
336
+ </details>
337
+ <details>
338
+ <summary>Serve with KServe <!-- ICON PLACEHOLDER: Kubeflow --></summary>
339
+ <div class="details-content">
340
+ <pre><code># inferenceservice.yaml
341
+ apiVersion: "serving.kserve.io/v1beta1"
342
+ kind: "InferenceService"
343
+ metadata:
344
+ name: "pytorch-model"
345
+ spec:
346
+ predictor:
347
+ pytorch:
348
+ storageUri: "pvc://your-pvc/path/to/model-dir"
349
+ </code></pre>
350
+ </div>
351
+ </details>
352
+ </div>
353
+ <div class="stack-layer">
354
+ <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
355
+ <p>Package the application with a multi-stage Dockerfile and define its runtime with Kubernetes Deployment, Service, and HPA objects.</p>
356
+ <details>
357
+ <summary>Dockerfile</summary>
358
+ <div class="details-content">
359
+ <pre><code># Multi-stage build for a lean final image
360
+ FROM python:3.9-slim as builder
361
+ WORKDIR /install
362
+ RUN pip install --no-cache-dir --prefix="/install" torch fastapi "uvicorn[standard]"
363
+
364
+ FROM python:3.9-slim
365
+ WORKDIR /app
366
+ COPY --from=builder /install /usr/local
367
+ COPY ./app.py /app/
368
+ COPY ./pytorch_model.pth /app/
369
+ EXPOSE 8000
370
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
371
+ </code></pre>
372
+ </div>
373
+ </details>
374
+ <details>
375
+ <summary>Deployment & Service YAML <!-- ICON PLACEHOLDER: Kubernetes --></summary>
376
+ <div class="details-content">
377
+ <pre><code># deployment.yaml
378
+ apiVersion: apps/v1
379
+ kind: Deployment
380
+ metadata:
381
+ name: pytorch-deployment
382
+ spec:
383
+ replicas: 2
384
+ selector: { matchLabels: { app: pytorch } }
385
+ template:
386
+ metadata: { labels: { app: pytorch } }
387
+ spec:
388
+ containers:
389
+ - name: server
390
+ image: your-repo/pytorch-server:latest
391
+ ports:
392
+ - containerPort: 8000
393
+ ---
394
+ # service.yaml
395
+ apiVersion: v1
396
+ kind: Service
397
+ metadata:
398
+ name: pytorch-service
399
+ spec:
400
+ type: NodePort
401
+ selector: { app: pytorch }
402
+ ports:
403
+ - port: 80
404
+ targetPort: 8000
405
+ </code></pre>
406
+ </div>
407
+ </details>
408
+ <details>
409
+ <summary>Autoscaling (HPA) YAML</summary>
410
+ <div class="details-content">
411
+ <pre><code>apiVersion: autoscaling/v2
412
+ kind: HorizontalPodAutoscaler
413
+ metadata:
414
+ name: pytorch-hpa
415
+ spec:
416
+ scaleTargetRef:
417
+ apiVersion: apps/v1
418
+ kind: Deployment
419
+ name: pytorch-deployment
420
+ minReplicas: 1
421
+ maxReplicas: 5
422
+ metrics:
423
+ - type: Resource
424
+ resource: { name: cpu, target: { type: Utilization, averageUtilization: 80 } }
425
+ </code></pre>
426
+ </div>
427
+ </details>
428
+ </div>
429
+ <div class="stack-layer">
430
+ <h3><i class="material-icons">memory</i>Hardware Layer</h3>
431
+ <p><strong>CPUs:</strong> Suitable for small networks or where latency is not critical.<br><strong>GPUs:</strong> Essential for deep learning models to achieve low-latency inference. Use NVIDIA GPUs (T4, A10G, A100) for best performance with CUDA.<br><strong>TPUs:</strong> Best for massive-scale inference on Google Cloud, especially for models trained on TPUs.</p>
432
+ </div>
433
+ </div>
434
+
435
+ <!-- === TensorFlow Content Panel === -->
436
+ <div id="classic-tensorflow" class="content-panel">
437
+ <!-- Content for TensorFlow follows the same 4-layer structure -->
438
+ <div class="stack-layer">
439
+ <h3><i class="material-icons">psychology</i>Model Layer</h3>
440
+ <p>A simple Keras model saved in TensorFlow's `SavedModel` format, which bundles the architecture and weights.</p>
441
+ <p class="code-block-header">model_setup.py</p>
442
+ <pre><code># model_setup.py
443
+ import tensorflow as tf
444
+ model = tf.keras.Sequential([
445
+ tf.keras.layers.Dense(10, activation='relu', input_shape=(10,)),
446
+ tf.keras.layers.Dense(1)
447
+ ])
448
+ model.save("tf_saved_model")
449
+ print("Model saved to tf_saved_model/")</code></pre>
450
+ </div>
451
+ <div class="stack-layer">
452
+ <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
453
+ <p>FastAPI is a great choice for a custom Python server. KServe has native, high-performance support for the `SavedModel` format.</p>
454
+ <details>
455
+ <summary>Serve with FastAPI</summary>
456
+ <div class="details-content">
457
+ <pre><code># app.py
458
+ from fastapi import FastAPI
459
+ from pydantic import BaseModel
460
+ import tensorflow as tf
461
+ import numpy as np
462
+
463
+ app = FastAPI(title="TensorFlow Model Server")
464
+ model = tf.keras.models.load_model("tf_saved_model")
465
+
466
+ class PredReq(BaseModel): data: list[float]
467
+
468
+ @app.post("/predict")
469
+ def predict(req: PredReq):
470
+ pred = model.predict(np.array([req.data]))
471
+ return {"prediction": pred.flatten().tolist()}
472
+ </code></pre>
473
+ </div>
474
+ </details>
475
+ <details>
476
+ <summary>Serve with KServe <!-- ICON PLACEHOLDER: Kubeflow --></summary>
477
+ <div class="details-content">
478
+ <pre><code># inferenceservice.yaml
479
+ apiVersion: "serving.kserve.io/v1beta1"
480
+ kind: "InferenceService"
481
+ metadata:
482
+ name: "tensorflow-model"
483
+ spec:
484
+ predictor:
485
+ tensorflow:
486
+ storageUri: "s3://my-bucket/path/to/tf_saved_model"
487
+ </code></pre>
488
+ </div>
489
+ </details>
490
+ </div>
491
+ <div class="stack-layer">
492
+ <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
493
+ <p>The Kubernetes configuration is very similar to the PyTorch example. Ensure your Dockerfile copies the entire `tf_saved_model` directory and installs the `tensorflow` library.</p>
494
+ </div>
495
+ <div class="stack-layer">
496
+ <h3><i class="material-icons">memory</i>Hardware Layer</h3>
497
+ <p><strong>CPUs:</strong> Good for smaller Keras models. <br><strong>GPUs:</strong> Highly recommended for deep learning models. TensorFlow has excellent CUDA integration. <br><strong>TPUs:</strong> The premier choice for running TensorFlow models at scale, offering the best price/performance on GCP.</p>
498
+ </div>
499
+ </div>
500
+
501
+ <!-- === Scikit-learn Content Panel === -->
502
+ <div id="classic-sklearn" class="content-panel">
503
+ <div class="stack-layer">
504
+ <h3><i class="material-icons">psychology</i>Model Layer</h3>
505
+ <p>A classic logistic regression model. Serialization is typically done with `joblib` for efficiency with NumPy structures.</p>
506
+ <p class="code-block-header">model_setup.py</p>
507
+ <pre><code># model_setup.py
508
+ import joblib
509
+ from sklearn.linear_model import LogisticRegression
510
+ from sklearn.datasets import make_classification
511
+
512
+ X, y = make_classification(n_features=4)
513
+ model = LogisticRegression().fit(X, y)
514
+ joblib.dump(model, "sklearn_model.joblib")
515
+ print("Model saved to sklearn_model.joblib")</code></pre>
516
+ </div>
517
+ <div class="stack-layer">
518
+ <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
519
+ <p>FastAPI provides a simple and fast web server. KServe and Ray Serve also have native support for scikit-learn models.</p>
520
+ <details>
521
+ <summary>Serve with FastAPI</summary>
522
+ <div class="details-content">
523
+ <pre><code># app.py
524
+ from fastapi import FastAPI
525
+ from pydantic import BaseModel
526
+ import joblib, numpy as np
527
+
528
+ app = FastAPI(title="Scikit-learn Server")
529
+ model = joblib.load("sklearn_model.joblib")
530
+
531
+ class PredReq(BaseModel): data: list[list[float]]
532
+
533
+ @app.post("/predict")
534
+ def predict(req: PredReq):
535
+ pred = model.predict(np.array(req.data))
536
+ return {"prediction": pred.tolist()}
537
+ </code></pre>
538
+ </div>
539
+ </details>
540
+ <details>
541
+ <summary>Serve with KServe <!-- ICON PLACEHOLDER: Kubeflow --></summary>
542
+ <div class="details-content">
543
+ <pre><code># inferenceservice.yaml
544
+ apiVersion: "serving.kserve.io/v1beta1"
545
+ kind: "InferenceService"
546
+ metadata:
547
+ name: "sklearn-model"
548
+ spec:
549
+ predictor:
550
+ sklearn:
551
+ storageUri: "pvc://my-pvc/path/to/model-dir" # must contain model.joblib
552
+ </code></pre>
553
+ </div>
554
+ </details>
555
+ </div>
556
+ <div class="stack-layer">
557
+ <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
558
+ <p>Standard Kubernetes setup. The Docker container will be lightweight as it only needs `scikit-learn`, `joblib`, and `fastapi`.</p>
559
+ </div>
560
+ <div class="stack-layer">
561
+ <h3><i class="material-icons">memory</i>Hardware Layer</h3>
562
+ <p><strong>CPUs:</strong> Almost always sufficient. Scikit-learn models are designed to run efficiently on CPUs.<br><strong>GPUs/TPUs:</strong> Not used. There is no GPU acceleration for standard scikit-learn algorithms.</p>
563
+ </div>
564
+ </div>
565
+
566
+ <!-- === XGBoost Content Panel === -->
567
+ <div id="classic-xgboost" class="content-panel">
568
+ <div class="stack-layer">
569
+ <h3><i class="material-icons">psychology</i>Model Layer</h3>
570
+ <p>An XGBoost model saved in its native JSON format, which is portable and human-readable.</p>
571
+ <p class="code-block-header">model_setup.py</p>
572
+ <pre><code># model_setup.py
573
+ import xgboost as xgb
574
+ from sklearn.datasets import make_classification
575
+
576
+ X, y = make_classification(n_features=4)
577
+ dtrain = xgb.DMatrix(X, label=y)
578
+ model = xgb.train({'objective':'binary:logistic'}, dtrain, 10)
579
+ model.save_model("xgboost_model.json")
580
+ print("Model saved to xgboost_model.json")</code></pre>
581
+ </div>
582
+ <div class="stack-layer">
583
+ <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
584
+ <p>KServe and Ray Serve both support XGBoost. A custom FastAPI server is also a robust option.</p>
585
+ <details>
586
+ <summary>Serve with FastAPI</summary>
587
+ <div class="details-content">
588
+ <pre><code># app.py
589
+ from fastapi import FastAPI
590
+ from pydantic import BaseModel
591
+ import xgboost as xgb, numpy as np
592
+
593
+ app = FastAPI(title="XGBoost Server")
594
+ model = xgb.Booster()
595
+ model.load_model("xgboost_model.json")
596
+
597
+ class PredReq(BaseModel): data: list[list[float]]
598
+
599
+ @app.post("/predict")
600
+ def predict(req: PredReq):
601
+ dmatrix = xgb.DMatrix(np.array(req.data))
602
+ pred = model.predict(dmatrix)
603
+ return {"prediction": pred.tolist()}
604
+ </code></pre>
605
+ </div>
606
+ </details>
607
+ </div>
608
+ <div class="stack-layer">
609
+ <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
610
+ <p>Standard Kubernetes setup. The Dockerfile should include the `xgboost` library.</p>
611
+ </div>
612
+ <div class="stack-layer">
613
+ <h3><i class="material-icons">memory</i>Hardware Layer</h3>
614
+ <p><strong>CPUs:</strong> Excellent performance for most use cases.<br><strong>GPUs:</strong> XGBoost has optional GPU acceleration which can provide a significant speedup for large datasets and complex trees during inference.</p>
615
+ </div>
616
+ </div>
617
+
618
+ <!-- === JAX Content Panel === -->
619
+ <div id="classic-jax" class="content-panel">
620
+ <div class="stack-layer">
621
+ <h3><i class="material-icons">psychology</i>Model Layer</h3>
622
+ <p>JAX models are often defined as pure functions with parameters handled separately. We save the parameters using NumPy.</p>
623
+ <p class="code-block-header">model_setup.py</p>
624
+ <pre><code># model_setup.py
625
+ import jax
626
+ import jax.numpy as jnp
627
+ import numpy as np
628
+
629
+ # A pure function for linear regression
630
+ def predict_fn(params, inputs):
631
+ return jnp.dot(inputs, params['w']) + params['b']
632
+
633
+ # Initialize and save dummy parameters
634
+ key = jax.random.PRNGKey(0)
635
+ params = {
636
+ 'w': jax.random.normal(key, (10,)),
637
+ 'b': jnp.array(0.0)
638
+ }
639
+ np.savez("jax_params.npz", **params)
640
+ print("Parameters saved to jax_params.npz")</code></pre>
641
+ </div>
642
+ <div class="stack-layer">
643
+ <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
644
+ <p>Ray Serve is an excellent fit for JAX's functional paradigm. A custom FastAPI server is also straightforward. KServe requires a custom container.</p>
645
+ <details>
646
+ <summary>Serve with FastAPI</summary>
647
+ <div class="details-content">
648
+ <pre><code># app.py
649
+ from fastapi import FastAPI
650
+ from pydantic import BaseModel
651
+ import jax, jax.numpy as jnp, numpy as np
652
+
653
+ # Define predict function and JIT-compile it
654
+ @jax.jit
655
+ def predict_fn(params, inputs):
656
+ return jnp.dot(inputs, params['w']) + params['b']
657
+
658
+ app = FastAPI(title="JAX Server")
659
+ params = np.load("jax_params.npz")
660
+
661
+ class PredReq(BaseModel): data: list[float]
662
+
663
+ @app.post("/predict")
664
+ def predict(req: PredReq):
665
+ pred = predict_fn(params, jnp.array(req.data))
666
+ return {"prediction": pred.tolist()}
667
+ </code></pre>
668
+ </div>
669
+ </details>
670
+ </div>
671
+ <div class="stack-layer">
672
+ <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
673
+ <p>The Kubernetes configuration is standard. The Dockerfile needs to install `jax` and `jaxlib` corresponding to the target hardware (CPU or GPU).</p>
674
+ </div>
675
+ <div class="stack-layer">
676
+ <h3><i class="material-icons">memory</i>Hardware Layer</h3>
677
+ <p><strong>CPUs:</strong> JAX is very fast on CPU.<br><strong>GPUs/TPUs:</strong> JAX was designed for accelerators and excels on GPUs and TPUs, often outperforming other frameworks due to its XLA-based compilation.</p>
678
+ </div>
679
+ </div>
680
+
681
+ <!-- === LLM Content Panel === -->
682
+ <div id="genai-llm" class="content-panel">
683
+ <div class="stack-layer">
684
+ <h3><i class="material-icons">psychology</i>Model Layer</h3>
685
+ <p>Large Language Models (e.g., Llama, Mistral) are based on the Transformer architecture. The key inference challenge is managing the <strong>KV Cache</strong>, a stateful cache of attention keys and values that grows with every generated token and consumes massive amounts of VRAM.</p>
686
+ </div>
687
+ <div class="stack-layer">
688
+ <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
689
+ <p>Specialized serving toolkits are required for efficient LLM inference. These handle complex optimizations like continuous batching and KV cache management.</p>
690
+ <ul>
691
+ <li><strong>vLLM:</strong> A high-throughput serving engine using PagedAttention to optimize KV cache memory, drastically improving throughput.</li>
692
+ <li><strong>Text Generation Inference (TGI):</strong> Hugging Face's production-ready solution with tensor parallelism and optimized kernels.</li>
693
+ <li><strong>TensorRT-LLM:</strong> NVIDIA's library for compiling LLMs into highly optimized engines for NVIDIA GPUs.</li>
694
+ </ul>
695
+ </div>
696
+ <div class="stack-layer">
697
+ <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
698
+ <p>Deployments must request large amounts of GPU resources (`nvidia.com/gpu: 1`) and memory. Node affinity and taints/tolerations are used to schedule pods onto specific GPU node pools (e.g., nodes with A100s).</p>
699
+ </div>
700
+ <div class="stack-layer">
701
+ <h3><i class="material-icons">memory</i>Hardware Layer</h3>
702
+ <p><strong>GPUs:</strong> Essential. High-VRAM GPUs like NVIDIA A100 (40GB/80GB) or H100 (80GB) are required to fit the model weights and KV cache. Multiple GPUs are often needed for larger models via tensor parallelism.</p>
703
+ </div>
704
+ </div>
705
+
706
+ <!-- === VLM Content Panel === -->
707
+ <div id="genai-vlm" class="content-panel">
708
+ <div class="stack-layer">
709
+ <h3><i class="material-icons">psychology</i>Model Layer</h3>
710
+ <p>Visual Large Models (e.g., LLaVA, CogVLM) combine a vision encoder (like ViT) with an LLM. They can process and reason about both images and text, making them powerful but complex to serve.</p>
711
+ </div>
712
+ <div class="stack-layer">
713
+ <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
714
+ <p>The serving stack must handle multi-modal inputs (e.g., base64-encoded images and text in a single JSON payload). Preprocessing the image into tensors is a key part of the serving logic. Frameworks like <strong>vLLM</strong> and <strong>SGLang</strong> are adding support for VLMs.</p>
715
+ </div>
716
+ <div class="stack-layer">
717
+ <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
718
+ <p>Similar to LLMs, VLM deployments require significant GPU and memory resources. The API server (e.g., FastAPI) must be configured to accept large request bodies to accommodate image data.</p>
719
+ </div>
720
+ <div class="stack-layer">
721
+ <h3><i class="material-icons">memory</i>Hardware Layer</h3>
722
+ <p><strong>GPUs:</strong> High-VRAM GPUs are mandatory. The VRAM must accommodate the vision encoder, the LLM, and the KV cache, making memory requirements even higher than for a text-only LLM of a similar size.</p>
723
+ </div>
724
+ </div>
725
+
726
+ <!-- === Diffusion Content Panel === -->
727
+ <div id="genai-diffusion" class="content-panel">
728
+ <div class="stack-layer">
729
+ <h3><i class="material-icons">psychology</i>Model Layer</h3>
730
+ <p>Diffusion models (e.g., Stable Diffusion) generate images through an iterative denoising process. Each step is a full forward pass through a large UNet model, making inference latency a major challenge.</p>
731
+ </div>
732
+ <div class="stack-layer">
733
+ <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
734
+ <p>Optimizing the serving stack focuses on reducing the number of inference steps and speeding up each step.</p>
735
+ <ul>
736
+ <li><strong>Model Compilation:</strong> Use tools like <strong>TensorRT</strong> or `torch.compile` to optimize the UNet and VAE components for the target GPU.</li>
737
+ <li><strong>Latent Consistency Models (LCMs):</strong> A powerful distillation technique that allows for high-quality image generation in just 2-8 steps, drastically cutting latency.</li>
738
+ <li><strong>Custom Pipelines:</strong> Tools like <strong>ComfyUI</strong> or <strong>Diffusers</strong> provide flexible pipelines that can be wrapped in a serving framework like FastAPI or Ray Serve.</li>
739
+ </ul>
740
+ </div>
741
+ <div class="stack-layer">
742
+ <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
743
+ <p>Deployments must be stateful if caching compiled models or dealing with user-specific LoRAs. Persistent Volumes (PVCs) can be used to store these assets. Resource requests for GPU and VRAM are critical.</p>
744
+ </div>
745
+ <div class="stack-layer">
746
+ <h3><i class="material-icons">memory</i>Hardware Layer</h3>
747
+ <p><strong>GPUs:</strong> High-end consumer (e.g., RTX 4090) or datacenter GPUs (A10G, A100) are needed for acceptable generation speeds. VRAM is the most critical resource, as it limits the output resolution and batch size.</p>
748
+ </div>
749
+ </div>
750
+ </div>
751
+
752
+ <!-- ======================= Generic ML Optimization Section ======================= -->
753
+ <div id="optimizations" style="margin-top: 3rem;">
754
+ <h2 class="main-section-title"><i class="material-icons">speed</i>Generic ML Optimization</h2>
755
+ <div class="content-panel active">
756
+ <div class="stack-layer">
757
+ <h3><i class="material-icons">dns</i>Optimize the Cluster</h3>
758
+ <p>Tune the foundation for performance and cost.</p>
759
+ <ul>
760
+ <li><strong>Node Tuning:</strong> Use appropriate machine types (e.g., GPU nodes for DL, compute-optimized for CPU-bound tasks).</li>
761
+ <li><strong>Cluster Autoscaling:</strong> Automatically add/remove nodes based on demand to save costs.</li>
762
+ <li><strong>Network Policies:</strong> Secure inter-service communication within the cluster.</li>
763
+ </ul>
764
+ </div>
765
+ <div class="stack-layer">
766
+ <h3><i class="material-icons">web</i>Optimize the Container & Server</h3>
767
+ <p>Make the serving application itself as efficient as possible.</p>
768
+ <ul>
769
+ <li><strong>Efficient Web Server:</strong> Use ASGI servers (Uvicorn, Hypercorn) with FastAPI over WSGI (Flask) for better async performance.</li>
770
+ <li><strong>Dynamic Batching:</strong> Group incoming requests into a single batch to maximize hardware utilization, especially on GPUs.</li>
771
+ <li><strong>Lean Containers:</strong> Use multi-stage Docker builds to create small, secure production images.</li>
772
+ </ul>
773
+ </div>
774
+ <div class="stack-layer">
775
+ <h3><i class="material-icons">compress</i>Optimize the Model</h3>
776
+ <p>Reduce model size and increase inference speed.</p>
777
+ <ul>
778
+ <li><strong>Quantization:</strong> Reduce model precision (e.g., FP32 to INT8/FP8) to shrink size and accelerate inference.</li>
779
+ <li><strong>Pruning:</strong> Remove unnecessary weights from the model to create a smaller, faster "sparse" version.</li>
780
+ <li><strong>Compilation:</strong> Use tools like TensorRT, OpenVINO, or JAX's JIT to compile the model into highly optimized, hardware-specific code.</li>
781
+ </ul>
782
+ </div>
783
+ </div>
784
+ </div>
785
+ </main>
786
+ </div>
787
+
788
+ <script>
789
document.addEventListener('DOMContentLoaded', function () {
  const tiles = document.querySelectorAll('.tile');
  const contentPanels = document.querySelectorAll('.content-panel');

  /**
   * Activate the content panel whose id matches the clicked tile's
   * `data-target`, and mark that tile as the active one.
   * Panels living inside the always-visible `#optimizations` section are
   * deliberately never hidden.
   * @param {MouseEvent} event - click event fired on a `.tile` element
   */
  function switchPanel(event) {
    const targetId = event.currentTarget.dataset.target;

    // Highlight only the clicked tile.
    tiles.forEach((tile) => tile.classList.remove('active'));
    event.currentTarget.classList.add('active');

    // Show the matching panel; hide the rest of the tile-controlled panels.
    contentPanels.forEach((panel) => {
      if (panel.id === targetId) {
        panel.classList.add('active');
      } else if (panel.parentElement.id !== 'optimizations') {
        // A missing id is already !== 'optimizations', so a single strict
        // comparison suffices (the original `!id || id !== ...` was redundant).
        panel.classList.remove('active');
      }
    });
  }

  tiles.forEach((tile) => tile.addEventListener('click', switchPanel));

  // Attach a "Copy" button to every <pre> that wraps a <code> block.
  document.querySelectorAll('pre').forEach((pre) => {
    const code = pre.querySelector('code');
    if (!code) return; // bare <pre> without code: nothing to copy

    const copyButton = document.createElement('button');
    copyButton.innerText = 'Copy';
    copyButton.className = 'copy-btn';

    copyButton.addEventListener('click', (e) => {
      e.stopPropagation(); // Prevent details/summary from toggling
      navigator.clipboard.writeText(code.innerText).then(() => {
        // Brief visual confirmation, then restore the original label.
        copyButton.innerText = 'Copied!';
        copyButton.classList.add('copied');
        setTimeout(() => {
          copyButton.innerText = 'Copy';
          copyButton.classList.remove('copied');
        }, 2000);
      }).catch((err) => {
        console.error('Failed to copy text: ', err);
      });
    });

    pre.appendChild(copyButton);
  });
});
848
+ </script>
849
+ </body>
850
+ </html>