fredmo commited on
Commit
4472750
·
verified ·
1 Parent(s): 13e6b23

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +391 -704
index.html CHANGED
@@ -3,7 +3,7 @@
3
  <head>
4
  <meta charset="UTF-8">
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
- <title>The MLOps Engineer's Cheatsheet for Model Serving</title>
7
  <link rel="preconnect" href="https://fonts.googleapis.com">
8
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
9
  <link href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" rel="stylesheet">
@@ -14,6 +14,7 @@
14
  --primary-color: #1E88E5; /* Blue */
15
  --primary-dark: #1565C0;
16
  --secondary-color: #004d40; /* Dark Teal for contrast */
 
17
  --background-color: #f4f6f8;
18
  --card-bg-color: #ffffff;
19
  --text-color: #333;
@@ -36,750 +37,324 @@
36
  }
37
 
38
  /* --- Layout & Containers --- */
39
- .container {
40
- max-width: 1200px;
41
- margin: 0 auto;
42
- padding: 2rem;
43
- }
44
-
45
- header {
46
- text-align: center;
47
- margin-bottom: 2rem;
48
- }
49
-
50
- header h1 {
51
- color: var(--heading-color);
52
- font-weight: 700;
53
- font-size: 2.8rem;
54
- margin-bottom: 0.5rem;
55
- }
56
-
57
- header p {
58
- font-size: 1.1rem;
59
- color: var(--subtle-text-color);
60
- max-width: 800px;
61
- margin: 0 auto;
62
- }
63
 
64
  .main-section-title {
65
- font-size: 2.2rem;
66
- color: var(--heading-color);
67
- border-bottom: 3px solid var(--primary-color);
68
- padding-bottom: 0.75rem;
69
- margin-top: 3rem;
70
- margin-bottom: 2rem;
71
- display: flex;
72
- align-items: center;
73
- }
74
-
75
- .main-section-title .material-icons {
76
- font-size: 2.8rem;
77
- margin-right: 1rem;
78
- }
79
-
80
- /* --- Tile Navigation --- */
81
- .tile-container {
82
- display: grid;
83
- grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
84
- gap: 1.5rem;
85
- margin-bottom: 2.5rem;
86
  }
 
 
 
 
 
 
 
 
87
 
88
- .tile {
89
- background-color: var(--card-bg-color);
90
- border: 2px solid var(--border-color);
91
- border-radius: 8px;
92
- padding: 1.5rem;
93
- text-align: center;
94
- cursor: pointer;
95
- transition: transform 0.2s ease, box-shadow 0.2s ease, border-color 0.2s ease;
96
- display: flex;
97
- flex-direction: column;
98
- align-items: center;
99
- justify-content: center;
100
- min-height: 150px;
101
- }
102
-
103
- .tile:hover {
104
- transform: translateY(-5px);
105
- box-shadow: var(--tile-hover-shadow);
106
- border-color: var(--primary-color);
107
- }
108
-
109
- .tile.active {
110
- border-color: var(--primary-color);
111
- box-shadow: var(--tile-hover-shadow);
112
- background-color: #f0f7ff;
113
- }
114
 
115
- .tile .material-icons {
116
- font-size: 3rem;
117
- color: var(--primary-color);
118
- margin-bottom: 1rem;
 
 
 
119
  }
 
 
 
120
 
121
- .tile h4 {
122
- margin: 0;
123
- font-size: 1.2rem;
124
- color: var(--heading-color);
125
  }
 
126
 
127
- /* --- Content Panels --- */
128
- .content-panel {
129
- display: none; /* Hidden by default, shown by JS */
130
- background-color: var(--card-bg-color);
131
- border-radius: 8px;
132
- box-shadow: var(--shadow);
133
- padding: 2.5rem;
134
- margin-top: 1rem;
135
  }
136
-
137
- .content-panel.active {
138
- display: block;
 
 
139
  }
140
-
141
- .stack-layer {
142
- margin-bottom: 2.5rem;
143
- padding-bottom: 1.5rem;
144
- border-bottom: 1px solid var(--border-color);
 
 
 
 
 
 
145
  }
146
 
147
- .stack-layer:last-child {
148
- border-bottom: none;
149
- margin-bottom: 0;
150
- }
151
-
152
- .stack-layer h3 {
153
- font-size: 1.6rem;
154
- color: var(--secondary-color);
155
- margin-top: 0;
156
- display: flex;
157
- align-items: center;
158
- }
159
-
160
- .stack-layer h3 .material-icons {
161
- margin-right: 12px;
162
- font-size: 2rem;
163
- }
164
-
165
- /* --- Collapsible Sections & Code --- */
166
- details {
167
- border: 1px solid var(--border-color);
168
- border-radius: 6px;
169
  margin-bottom: 1rem;
170
- background-color: #f9fafb;
171
- transition: background-color 0.2s ease-in-out;
172
- }
173
-
174
- details[open] { background-color: var(--card-bg-color); }
175
- summary {
176
- cursor: pointer;
177
- padding: 1rem;
178
- font-weight: 500;
179
- font-size: 1.1rem;
180
- list-style: none;
181
- display: flex;
182
- align-items: center;
183
- justify-content: space-between;
184
- }
185
- summary::-webkit-details-marker { display: none; }
186
- summary::after {
187
- font-family: 'Material Icons';
188
- content: 'expand_more';
189
- transform: rotate(0deg);
190
- transition: transform 0.2s ease-in-out;
191
- }
192
- details[open] > summary::after { transform: rotate(180deg); }
193
- .details-content { padding: 0 1rem 1rem 1rem; border-top: 1px solid var(--border-color); }
194
-
195
- pre {
196
- background-color: var(--code-bg-color);
197
- color: var(--code-text-color);
198
- padding: 1.5rem 1rem 1rem 1rem;
199
- border-radius: 6px;
200
- overflow-x: auto;
201
- font-size: 0.9em;
202
- position: relative;
203
  }
 
 
 
 
 
 
 
 
 
 
204
  code { font-family: 'Courier New', Courier, monospace; }
205
- .code-block-header { font-weight: bold; color: var(--subtle-text-color); margin-bottom: -0.5rem; margin-top: 1rem; }
206
- .copy-btn { position: absolute; top: 10px; right: 10px; background-color: #4a505c; color: #fff; border: none; padding: 6px 10px; border-radius: 4px; cursor: pointer; opacity: 0.7; transition: opacity 0.2s, background-color 0.2s; }
207
  pre:hover .copy-btn { opacity: 1; }
208
- .copy-btn:hover { background-color: #6c7382; }
209
  .copy-btn.copied { background-color: var(--primary-dark); }
210
- .icon-placeholder { font-style: italic; color: #999; display: inline-block; margin-left: 8px; }
211
-
212
  </style>
213
  </head>
214
  <body>
215
 
216
  <div class="container">
217
  <header>
218
- <h1>The MLOps Engineer's Cheatsheet for Model Serving</h1>
219
- <p>Select a framework or model type to see a practical guide for serving it—from local code to a production-grade, auto-scaling Kubernetes deployment.</p>
220
  </header>
221
 
222
  <main>
223
- <!-- ======================= Classic ML Tiles ======================= -->
224
- <h2 class="main-section-title"><i class="material-icons">model_training</i>Classic Machine Learning</h2>
225
- <div class="tile-container">
226
- <div class="tile active" data-target="classic-pytorch">
227
- <span class="material-icons">whatshot</span>
228
- <h4><!-- ICON PLACEHOLDER: PyTorch -->PyTorch</h4>
229
- </div>
230
- <div class="tile" data-target="classic-tensorflow">
231
- <span class="material-icons">hub</span>
232
- <h4><!-- ICON PLACEHOLDER: TensorFlow -->TensorFlow</h4>
233
  </div>
234
- <div class="tile" data-target="classic-sklearn">
235
- <span class="material-icons">data_object</span>
236
- <h4><!-- ICON PLACEHOLDER: Scikit-learn -->Scikit-learn</h4>
237
- </div>
238
- <div class="tile" data-target="classic-xgboost">
239
- <span class="material-icons">trending_up</span>
240
- <h4><!-- ICON PLACEHOLDER: XGBoost -->XGBoost</h4>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  </div>
242
- <div class="tile" data-target="classic-jax">
243
- <span class="material-icons">functions</span>
244
- <h4><!-- ICON PLACEHOLDER: JAX -->JAX</h4>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  </div>
 
 
246
  </div>
 
 
247
 
248
- <!-- ======================= Generative AI Tiles ======================= -->
249
- <h2 class="main-section-title"><i class="material-icons">auto_awesome</i>Generative AI</h2>
 
250
  <div class="tile-container">
251
- <div class="tile" data-target="genai-llm">
252
- <span class="material-icons">chat</span>
253
- <h4>LLMs</h4>
254
- </div>
255
- <div class="tile" data-target="genai-vlm">
256
- <span class="material-icons">image_search</span>
257
- <h4>Multimodal (VLMs)</h4>
258
- </div>
259
- <div class="tile" data-target="genai-diffusion">
260
- <span class="material-icons">palette</span>
261
- <h4>Diffusion Models</h4>
262
- </div>
263
  </div>
264
-
265
- <!-- ======================= Content Panels Container ======================= -->
266
- <div class="content-container">
267
 
268
- <!-- === PyTorch Content Panel === -->
269
- <div id="classic-pytorch" class="content-panel active">
270
- <div class="stack-layer">
271
- <h3><i class="material-icons">psychology</i>Model Layer</h3>
 
 
 
 
 
 
 
272
  <p>A simple feed-forward network defined in PyTorch. The model's `state_dict` is saved for deployment.</p>
273
  <p class="code-block-header">model_setup.py</p>
274
- <pre><code># model_setup.py
275
- import torch
276
  import torch.nn as nn
277
-
278
  class SimpleNet(nn.Module):
279
  def __init__(self):
280
  super(SimpleNet, self).__init__()
281
  self.linear = nn.Linear(10, 1)
282
-
283
- def forward(self, x):
284
- return self.linear(x)
285
-
286
  model = SimpleNet()
287
- torch.save(model.state_dict(), "pytorch_model.pth")
288
- print("Model saved to pytorch_model.pth")</code></pre>
289
  </div>
290
- <div class="stack-layer">
291
- <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
292
- <p>Use a high-performance web framework like FastAPI for the API. For managed serving, KServe and Ray Serve offer powerful abstractions.</p>
293
- <details>
294
- <summary>Serve with FastAPI</summary>
295
- <div class="details-content">
296
- <pre><code># app.py
297
- from fastapi import FastAPI
298
- from pydantic import BaseModel
299
- import torch
300
- # ... (include SimpleNet class definition here) ...
301
-
302
- app = FastAPI(title="PyTorch Model Server")
303
- model = SimpleNet()
304
- model.load_state_dict(torch.load("pytorch_model.pth"))
305
- model.eval()
306
-
307
- class PredReq(BaseModel): data: list[float]
308
-
309
- @app.post("/predict")
310
- def predict(req: PredReq):
311
- tensor = torch.tensor([req.data], dtype=torch.float32)
312
- with torch.no_grad():
313
- pred = model(tensor)
314
- return {"prediction": pred.item()}
315
- </code></pre>
316
- </div>
317
- </details>
318
- <details>
319
- <summary>Serve with Ray Serve <!-- ICON PLACEHOLDER: Ray --></summary>
320
- <div class="details-content">
321
- <pre><code># ray_serve_app.py
322
- from ray import serve
323
- # ... (include FastAPI app, model class, etc.) ...
324
-
325
- @serve.deployment
326
- @serve.ingress(app)
327
- class ModelServer:
328
- def __init__(self):
329
- self.model = SimpleNet()
330
- self.model.load_state_dict(torch.load("pytorch_model.pth"))
331
- self.model.eval()
332
-
333
- # FastAPI handles routing, this class just holds the model
334
- </code></pre>
335
- </div>
336
- </details>
337
- <details>
338
- <summary>Serve with KServe <!-- ICON PLACEHOLDER: Kubeflow --></summary>
339
- <div class="details-content">
340
- <pre><code># inferenceservice.yaml
341
- apiVersion: "serving.kserve.io/v1beta1"
342
- kind: "InferenceService"
343
- metadata:
344
- name: "pytorch-model"
345
- spec:
346
- predictor:
347
- pytorch:
348
- storageUri: "pvc://your-pvc/path/to/model-dir"
349
- </code></pre>
350
- </div>
351
- </details>
352
  </div>
353
- <div class="stack-layer">
354
- <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
355
- <p>Package the application with a multi-stage Dockerfile and define its runtime with Kubernetes Deployment, Service, and HPA objects.</p>
356
- <details>
357
- <summary>Dockerfile</summary>
358
- <div class="details-content">
359
- <pre><code># Multi-stage build for a lean final image
360
- FROM python:3.9-slim as builder
361
- WORKDIR /install
362
- RUN pip install --no-cache-dir --prefix="/install" torch fastapi "uvicorn[standard]"
363
-
364
- FROM python:3.9-slim
365
- WORKDIR /app
366
- COPY --from=builder /install /usr/local
367
- COPY ./app.py /app/
368
- COPY ./pytorch_model.pth /app/
369
- EXPOSE 8000
370
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
371
- </code></pre>
372
- </div>
373
- </details>
374
- <details>
375
- <summary>Deployment & Service YAML <!-- ICON PLACEHOLDER: Kubernetes --></summary>
376
- <div class="details-content">
377
- <pre><code># deployment.yaml
378
- apiVersion: apps/v1
379
- kind: Deployment
380
- metadata:
381
- name: pytorch-deployment
382
- spec:
383
- replicas: 2
384
- selector: { matchLabels: { app: pytorch } }
385
- template:
386
- metadata: { labels: { app: pytorch } }
387
- spec:
388
- containers:
389
- - name: server
390
- image: your-repo/pytorch-server:latest
391
- ports:
392
- - containerPort: 8000
393
- ---
394
- # service.yaml
395
- apiVersion: v1
396
- kind: Service
397
- metadata:
398
- name: pytorch-service
399
- spec:
400
- type: NodePort
401
- selector: { app: pytorch }
402
- ports:
403
- - port: 80
404
- targetPort: 8000
405
- </code></pre>
406
- </div>
407
- </details>
408
- <details>
409
- <summary>Autoscaling (HPA) YAML</summary>
410
- <div class="details-content">
411
- <pre><code>apiVersion: autoscaling/v2
412
- kind: HorizontalPodAutoscaler
413
- metadata:
414
- name: pytorch-hpa
415
- spec:
416
- scaleTargetRef:
417
- apiVersion: apps/v1
418
- kind: Deployment
419
- name: pytorch-deployment
420
- minReplicas: 1
421
- maxReplicas: 5
422
- metrics:
423
- - type: Resource
424
- resource: { name: cpu, target: { type: Utilization, averageUtilization: 80 } }
425
- </code></pre>
426
- </div>
427
- </details>
428
  </div>
429
- <div class="stack-layer">
430
- <h3><i class="material-icons">memory</i>Hardware Layer</h3>
431
- <p><strong>CPUs:</strong> Suitable for small networks or where latency is not critical.<br><strong>GPUs:</strong> Essential for deep learning models to achieve low-latency inference. Use NVIDIA GPUs (T4, A10G, A100) for best performance with CUDA.<br><strong>TPUs:</strong> Best for massive-scale inference on Google Cloud, especially for models trained on TPUs.</p>
432
  </div>
433
  </div>
434
-
435
- <!-- === TensorFlow Content Panel === -->
436
  <div id="classic-tensorflow" class="content-panel">
437
- <!-- Content for TensorFlow follows the same 4-layer structure -->
438
- <div class="stack-layer">
439
- <h3><i class="material-icons">psychology</i>Model Layer</h3>
440
  <p>A simple Keras model saved in TensorFlow's `SavedModel` format, which bundles the architecture and weights.</p>
441
  <p class="code-block-header">model_setup.py</p>
442
- <pre><code># model_setup.py
443
- import tensorflow as tf
444
  model = tf.keras.Sequential([
445
  tf.keras.layers.Dense(10, activation='relu', input_shape=(10,)),
446
  tf.keras.layers.Dense(1)
447
  ])
448
- model.save("tf_saved_model")
449
- print("Model saved to tf_saved_model/")</code></pre>
450
  </div>
451
- <div class="stack-layer">
452
- <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
453
- <p>FastAPI is a great choice for a custom Python server. KServe has native, high-performance support for the `SavedModel` format.</p>
454
- <details>
455
- <summary>Serve with FastAPI</summary>
456
- <div class="details-content">
457
- <pre><code># app.py
458
- from fastapi import FastAPI
459
- from pydantic import BaseModel
460
- import tensorflow as tf
461
- import numpy as np
462
-
463
- app = FastAPI(title="TensorFlow Model Server")
464
- model = tf.keras.models.load_model("tf_saved_model")
465
-
466
- class PredReq(BaseModel): data: list[float]
467
-
468
- @app.post("/predict")
469
- def predict(req: PredReq):
470
- pred = model.predict(np.array([req.data]))
471
- return {"prediction": pred.flatten().tolist()}
472
- </code></pre>
473
- </div>
474
- </details>
475
- <details>
476
- <summary>Serve with KServe <!-- ICON PLACEHOLDER: Kubeflow --></summary>
477
- <div class="details-content">
478
- <pre><code># inferenceservice.yaml
479
- apiVersion: "serving.kserve.io/v1beta1"
480
- kind: "InferenceService"
481
- metadata:
482
- name: "tensorflow-model"
483
- spec:
484
- predictor:
485
- tensorflow:
486
- storageUri: "s3://my-bucket/path/to/tf_saved_model"
487
- </code></pre>
488
- </div>
489
- </details>
490
  </div>
491
- <div class="stack-layer">
492
- <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
493
- <p>The Kubernetes configuration is very similar to the PyTorch example. Ensure your Dockerfile copies the entire `tf_saved_model` directory and installs the `tensorflow` library.</p>
494
  </div>
495
- <div class="stack-layer">
496
- <h3><i class="material-icons">memory</i>Hardware Layer</h3>
497
- <p><strong>CPUs:</strong> Good for smaller Keras models. <br><strong>GPUs:</strong> Highly recommended for deep learning models. TensorFlow has excellent CUDA integration. <br><strong>TPUs:</strong> The premier choice for running TensorFlow models at scale, offering the best price/performance on GCP.</p>
498
  </div>
499
  </div>
500
-
501
- <!-- === Scikit-learn Content Panel === -->
502
  <div id="classic-sklearn" class="content-panel">
503
- <div class="stack-layer">
504
- <h3><i class="material-icons">psychology</i>Model Layer</h3>
505
  <p>A classic logistic regression model. Serialization is typically done with `joblib` for efficiency with NumPy structures.</p>
506
  <p class="code-block-header">model_setup.py</p>
507
- <pre><code># model_setup.py
508
- import joblib
509
  from sklearn.linear_model import LogisticRegression
510
  from sklearn.datasets import make_classification
511
-
512
  X, y = make_classification(n_features=4)
513
  model = LogisticRegression().fit(X, y)
514
- joblib.dump(model, "sklearn_model.joblib")
515
- print("Model saved to sklearn_model.joblib")</code></pre>
516
  </div>
517
- <div class="stack-layer">
518
- <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
519
- <p>FastAPI provides a simple and fast web server. KServe and Ray Serve also have native support for scikit-learn models.</p>
520
- <details>
521
- <summary>Serve with FastAPI</summary>
522
- <div class="details-content">
523
- <pre><code># app.py
524
- from fastapi import FastAPI
525
- from pydantic import BaseModel
526
- import joblib, numpy as np
527
-
528
- app = FastAPI(title="Scikit-learn Server")
529
- model = joblib.load("sklearn_model.joblib")
530
-
531
- class PredReq(BaseModel): data: list[list[float]]
532
-
533
- @app.post("/predict")
534
- def predict(req: PredReq):
535
- pred = model.predict(np.array(req.data))
536
- return {"prediction": pred.tolist()}
537
- </code></pre>
538
- </div>
539
- </details>
540
- <details>
541
- <summary>Serve with KServe <!-- ICON PLACEHOLDER: Kubeflow --></summary>
542
- <div class="details-content">
543
- <pre><code># inferenceservice.yaml
544
- apiVersion: "serving.kserve.io/v1beta1"
545
- kind: "InferenceService"
546
- metadata:
547
- name: "sklearn-model"
548
- spec:
549
- predictor:
550
- sklearn:
551
- storageUri: "pvc://my-pvc/path/to/model-dir" # must contain model.joblib
552
- </code></pre>
553
- </div>
554
- </details>
555
  </div>
556
- <div class="stack-layer">
557
- <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
558
- <p>Standard Kubernetes setup. The Docker container will be lightweight as it only needs `scikit-learn`, `joblib`, and `fastapi`.</p>
559
  </div>
560
- <div class="stack-layer">
561
- <h3><i class="material-icons">memory</i>Hardware Layer</h3>
562
- <p><strong>CPUs:</strong> Almost always sufficient. Scikit-learn models are designed to run efficiently on CPUs.<br><strong>GPUs/TPUs:</strong> Not used. There is no GPU acceleration for standard scikit-learn algorithms.</p>
563
  </div>
564
  </div>
565
-
566
- <!-- === XGBoost Content Panel === -->
567
  <div id="classic-xgboost" class="content-panel">
568
- <div class="stack-layer">
569
- <h3><i class="material-icons">psychology</i>Model Layer</h3>
570
- <p>An XGBoost model saved in its native JSON format, which is portable and human-readable.</p>
571
- <p class="code-block-header">model_setup.py</p>
572
- <pre><code># model_setup.py
573
- import xgboost as xgb
574
- from sklearn.datasets import make_classification
575
-
576
- X, y = make_classification(n_features=4)
577
- dtrain = xgb.DMatrix(X, label=y)
578
- model = xgb.train({'objective':'binary:logistic'}, dtrain, 10)
579
- model.save_model("xgboost_model.json")
580
- print("Model saved to xgboost_model.json")</code></pre>
581
- </div>
582
- <div class="stack-layer">
583
- <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
584
- <p>KServe and Ray Serve both support XGBoost. A custom FastAPI server is also a robust option.</p>
585
- <details>
586
- <summary>Serve with FastAPI</summary>
587
- <div class="details-content">
588
- <pre><code># app.py
589
- from fastapi import FastAPI
590
- from pydantic import BaseModel
591
- import xgboost as xgb, numpy as np
592
-
593
- app = FastAPI(title="XGBoost Server")
594
- model = xgb.Booster()
595
- model.load_model("xgboost_model.json")
596
-
597
- class PredReq(BaseModel): data: list[list[float]]
598
-
599
- @app.post("/predict")
600
- def predict(req: PredReq):
601
- dmatrix = xgb.DMatrix(np.array(req.data))
602
- pred = model.predict(dmatrix)
603
- return {"prediction": pred.tolist()}
604
- </code></pre>
605
- </div>
606
- </details>
607
- </div>
608
- <div class="stack-layer">
609
- <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
610
- <p>Standard Kubernetes setup. The Dockerfile should include the `xgboost` library.</p>
611
- </div>
612
- <div class="stack-layer">
613
- <h3><i class="material-icons">memory</i>Hardware Layer</h3>
614
- <p><strong>CPUs:</strong> Excellent performance for most use cases.<br><strong>GPUs:</strong> XGBoost has optional GPU acceleration which can provide a significant speedup for large datasets and complex trees during inference.</p>
615
- </div>
616
  </div>
617
-
618
- <!-- === JAX Content Panel === -->
619
  <div id="classic-jax" class="content-panel">
620
- <div class="stack-layer">
621
- <h3><i class="material-icons">psychology</i>Model Layer</h3>
622
- <p>JAX models are often defined as pure functions with parameters handled separately. We save the parameters using NumPy.</p>
623
- <p class="code-block-header">model_setup.py</p>
624
- <pre><code># model_setup.py
625
- import jax
626
- import jax.numpy as jnp
627
- import numpy as np
628
-
629
- # A pure function for linear regression
630
- def predict_fn(params, inputs):
631
- return jnp.dot(inputs, params['w']) + params['b']
632
-
633
- # Initialize and save dummy parameters
634
- key = jax.random.PRNGKey(0)
635
- params = {
636
- 'w': jax.random.normal(key, (10,)),
637
- 'b': jnp.array(0.0)
638
- }
639
- np.savez("jax_params.npz", **params)
640
- print("Parameters saved to jax_params.npz")</code></pre>
641
- </div>
642
- <div class="stack-layer">
643
- <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
644
- <p>Ray Serve is an excellent fit for JAX's functional paradigm. A custom FastAPI server is also straightforward. KServe requires a custom container.</p>
645
- <details>
646
- <summary>Serve with FastAPI</summary>
647
- <div class="details-content">
648
- <pre><code># app.py
649
- from fastapi import FastAPI
650
- from pydantic import BaseModel
651
- import jax, jax.numpy as jnp, numpy as np
652
-
653
- # Define predict function and JIT-compile it
654
- @jax.jit
655
- def predict_fn(params, inputs):
656
- return jnp.dot(inputs, params['w']) + params['b']
657
-
658
- app = FastAPI(title="JAX Server")
659
- params = np.load("jax_params.npz")
660
-
661
- class PredReq(BaseModel): data: list[float]
662
-
663
- @app.post("/predict")
664
- def predict(req: PredReq):
665
- pred = predict_fn(params, jnp.array(req.data))
666
- return {"prediction": pred.tolist()}
667
- </code></pre>
668
- </div>
669
- </details>
670
- </div>
671
- <div class="stack-layer">
672
- <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
673
- <p>The Kubernetes configuration is standard. The Dockerfile needs to install `jax` and `jaxlib` corresponding to the target hardware (CPU or GPU).</p>
674
- </div>
675
- <div class="stack-layer">
676
- <h3><i class="material-icons">memory</i>Hardware Layer</h3>
677
- <p><strong>CPUs:</strong> JAX is very fast on CPU.<br><strong>GPUs/TPUs:</strong> JAX was designed for accelerators and excels on GPUs and TPUs, often outperforming other frameworks due to its XLA-based compilation.</p>
678
- </div>
679
  </div>
680
-
681
- <!-- === LLM Content Panel === -->
682
  <div id="genai-llm" class="content-panel">
683
- <div class="stack-layer">
684
- <h3><i class="material-icons">psychology</i>Model Layer</h3>
685
- <p>Large Language Models (e.g., Llama, Mistral) are based on the Transformer architecture. The key inference challenge is managing the <strong>KV Cache</strong>, a stateful cache of attention keys and values that grows with every generated token and consumes massive amounts of VRAM.</p>
686
- </div>
687
- <div class="stack-layer">
688
- <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
689
- <p>Specialized serving toolkits are required for efficient LLM inference. These handle complex optimizations like continuous batching and KV cache management.</p>
690
- <ul>
691
- <li><strong>vLLM:</strong> A high-throughput serving engine using PagedAttention to optimize KV cache memory, drastically improving throughput.</li>
692
- <li><strong>Text Generation Inference (TGI):</strong> Hugging Face's production-ready solution with tensor parallelism and optimized kernels.</li>
693
- <li><strong>TensorRT-LLM:</strong> NVIDIA's library for compiling LLMs into highly optimized engines for NVIDIA GPUs.</li>
694
- </ul>
695
- </div>
696
- <div class="stack-layer">
697
- <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
698
- <p>Deployments must request large amounts of GPU resources (`nvidia.com/gpu: 1`) and memory. Node affinity and taints/tolerations are used to schedule pods onto specific GPU node pools (e.g., nodes with A100s).</p>
699
- </div>
700
- <div class="stack-layer">
701
- <h3><i class="material-icons">memory</i>Hardware Layer</h3>
702
- <p><strong>GPUs:</strong> Essential. High-VRAM GPUs like NVIDIA A100 (40GB/80GB) or H100 (80GB) are required to fit the model weights and KV cache. Multiple GPUs are often needed for larger models via tensor parallelism.</p>
703
- </div>
704
  </div>
705
-
706
- <!-- === VLM Content Panel === -->
707
  <div id="genai-vlm" class="content-panel">
708
- <div class="stack-layer">
709
- <h3><i class="material-icons">psychology</i>Model Layer</h3>
710
- <p>Visual Large Models (e.g., LLaVA, CogVLM) combine a vision encoder (like ViT) with an LLM. They can process and reason about both images and text, making them powerful but complex to serve.</p>
711
- </div>
712
- <div class="stack-layer">
713
- <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
714
- <p>The serving stack must handle multi-modal inputs (e.g., base64-encoded images and text in a single JSON payload). Preprocessing the image into tensors is a key part of the serving logic. Frameworks like <strong>vLLM</strong> and <strong>SGLang</strong> are adding support for VLMs.</p>
715
- </div>
716
- <div class="stack-layer">
717
- <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
718
- <p>Similar to LLMs, VLM deployments require significant GPU and memory resources. The API server (e.g., FastAPI) must be configured to accept large request bodies to accommodate image data.</p>
719
- </div>
720
- <div class="stack-layer">
721
- <h3><i class="material-icons">memory</i>Hardware Layer</h3>
722
- <p><strong>GPUs:</strong> High-VRAM GPUs are mandatory. The VRAM must accommodate the vision encoder, the LLM, and the KV cache, making memory requirements even higher than for a text-only LLM of a similar size.</p>
723
- </div>
724
  </div>
725
-
726
- <!-- === Diffusion Content Panel === -->
727
  <div id="genai-diffusion" class="content-panel">
728
- <div class="stack-layer">
729
- <h3><i class="material-icons">psychology</i>Model Layer</h3>
730
- <p>Diffusion models (e.g., Stable Diffusion) generate images through an iterative denoising process. Each step is a full forward pass through a large UNet model, making inference latency a major challenge.</p>
731
- </div>
732
- <div class="stack-layer">
733
- <h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
734
- <p>Optimizing the serving stack focuses on reducing the number of inference steps and speeding up each step.</p>
735
- <ul>
736
- <li><strong>Model Compilation:</strong> Use tools like <strong>TensorRT</strong> or `torch.compile` to optimize the UNet and VAE components for the target GPU.</li>
737
- <li><strong>Latent Consistency Models (LCMs):</strong> A powerful distillation technique that allows for high-quality image generation in just 2-8 steps, drastically cutting latency.</li>
738
- <li><strong>Custom Pipelines:</strong> Tools like <strong>ComfyUI</strong> or <strong>Diffusers</strong> provide flexible pipelines that can be wrapped in a serving framework like FastAPI or Ray Serve.</li>
739
- </ul>
740
- </div>
741
- <div class="stack-layer">
742
- <h3><i class="material-icons">cloud_queue</i>Kubernetes Layer</h3>
743
- <p>Deployments must be stateful if caching compiled models or dealing with user-specific LoRAs. Persistent Volumes (PVCs) can be used to store these assets. Resource requests for GPU and VRAM are critical.</p>
744
- </div>
745
- <div class="stack-layer">
746
- <h3><i class="material-icons">memory</i>Hardware Layer</h3>
747
- <p><strong>GPUs:</strong> High-end consumer (e.g., RTX 4090) or datacenter GPUs (A10G, A100) are needed for acceptable generation speeds. VRAM is the most critical resource, as it limits the output resolution and batch size.</p>
748
- </div>
749
- </div>
750
- </div>
751
-
752
- <!-- ======================= Generic ML Optimization Section ======================= -->
753
- <div id="optimizations" style="margin-top: 3rem;">
754
- <h2 class="main-section-title"><i class="material-icons">speed</i>Generic ML Optimization</h2>
755
- <div class="content-panel active">
756
- <div class="stack-layer">
757
- <h3><i class="material-icons">dns</i>Optimize the Cluster</h3>
758
- <p>Tune the foundation for performance and cost.</p>
759
- <ul>
760
- <li><strong>Node Tuning:</strong> Use appropriate machine types (e.g., GPU nodes for DL, compute-optimized for CPU-bound tasks).</li>
761
- <li><strong>Cluster Autoscaling:</strong> Automatically add/remove nodes based on demand to save costs.</li>
762
- <li><strong>Network Policies:</strong> Secure inter-service communication within the cluster.</li>
763
- </ul>
764
- </div>
765
- <div class="stack-layer">
766
- <h3><i class="material-icons">web</i>Optimize the Container & Server</h3>
767
- <p>Make the serving application itself as efficient as possible.</p>
768
- <ul>
769
- <li><strong>Efficient Web Server:</strong> Use ASGI servers (Uvicorn, Hypercorn) with FastAPI over WSGI (Flask) for better async performance.</li>
770
- <li><strong>Dynamic Batching:</strong> Group incoming requests into a single batch to maximize hardware utilization, especially on GPUs.</li>
771
- <li><strong>Lean Containers:</strong> Use multi-stage Docker builds to create small, secure production images.</li>
772
- </ul>
773
- </div>
774
- <div class="stack-layer">
775
- <h3><i class="material-icons">compress</i>Optimize the Model</h3>
776
- <p>Reduce model size and increase inference speed.</p>
777
- <ul>
778
- <li><strong>Quantization:</strong> Reduce model precision (e.g., FP32 to INT8/FP8) to shrink size and accelerate inference.</li>
779
- <li><strong>Pruning:</strong> Remove unnecessary weights from the model to create a smaller, faster "sparse" version.</li>
780
- <li><strong>Compilation:</strong> Use tools like TensorRT, OpenVINO, or JAX's JIT to compile the model into highly optimized, hardware-specific code.</li>
781
- </ul>
782
- </div>
783
  </div>
784
  </div>
785
  </main>
@@ -787,63 +362,175 @@ def predict(req: PredReq):
787
 
788
  <script>
789
  document.addEventListener('DOMContentLoaded', function() {
790
- const tiles = document.querySelectorAll('.tile');
791
- const contentPanels = document.querySelectorAll('.content-panel');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
792
 
793
- // Function to switch active panels
794
- function switchPanel(event) {
795
- const targetId = event.currentTarget.dataset.target;
796
-
797
- // Update tiles
798
- tiles.forEach(tile => {
799
- tile.classList.remove('active');
800
- });
801
- event.currentTarget.classList.add('active');
802
-
803
- // Update content panels
804
- contentPanels.forEach(panel => {
805
- if (panel.id === targetId) {
806
- panel.classList.add('active');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
807
  } else {
808
- // Only hide panels that are part of the tile system
809
- if (!panel.parentElement.id || panel.parentElement.id !== 'optimizations') {
810
- panel.classList.remove('active');
811
- }
812
  }
813
  });
814
- }
815
 
816
- // Attach click listeners
817
- tiles.forEach(tile => {
818
- tile.addEventListener('click', switchPanel);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
819
  });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
820
 
821
- // Add copy buttons to all pre blocks
822
- const preBlocks = document.querySelectorAll('pre');
823
- preBlocks.forEach(pre => {
824
- const code = pre.querySelector('code');
825
- if (code) {
826
  const copyButton = document.createElement('button');
827
  copyButton.innerText = 'Copy';
828
  copyButton.className = 'copy-btn';
829
-
830
  copyButton.addEventListener('click', (e) => {
831
- e.stopPropagation(); // Prevent details/summary from toggling
832
- navigator.clipboard.writeText(code.innerText).then(() => {
833
  copyButton.innerText = 'Copied!';
834
  copyButton.classList.add('copied');
835
- setTimeout(() => {
836
- copyButton.innerText = 'Copy';
837
- copyButton.classList.remove('copied');
838
- }, 2000);
839
- }).catch(err => {
840
- console.error('Failed to copy text: ', err);
841
  });
842
  });
843
-
844
- pre.appendChild(copyButton);
845
  }
846
  });
 
847
  });
848
  </script>
849
  </body>
 
3
  <head>
4
  <meta charset="UTF-8">
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>The MLOps Engineer's Interactive Architecture Builder</title>
7
  <link rel="preconnect" href="https://fonts.googleapis.com">
8
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
9
  <link href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" rel="stylesheet">
 
14
  --primary-color: #1E88E5; /* Blue */
15
  --primary-dark: #1565C0;
16
  --secondary-color: #004d40; /* Dark Teal for contrast */
17
+ --genai-color: #6A1B9A; /* Purple for Gen AI */
18
  --background-color: #f4f6f8;
19
  --card-bg-color: #ffffff;
20
  --text-color: #333;
 
37
  }
38
 
39
  /* --- Layout & Containers --- */
40
+ .container { max-width: 1200px; margin: 0 auto; padding: 2rem; }
41
+ header { text-align: center; margin-bottom: 2rem; }
42
+ header h1 { color: var(--heading-color); font-weight: 700; font-size: 2.8rem; margin-bottom: 0.5rem; }
43
+ header p { font-size: 1.1rem; color: var(--subtle-text-color); max-width: 800px; margin: 0 auto; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  .main-section-title {
46
+ font-size: 2.2rem; color: var(--heading-color); border-bottom: 3px solid var(--primary-color);
47
+ padding-bottom: 0.75rem; margin-top: 3rem; margin-bottom: 2rem; display: flex; align-items: center;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  }
49
+ .main-section-title .material-icons { font-size: 2.8rem; margin-right: 1rem; }
50
+
51
+ /* --- Architecture Builder --- */
52
+ #architecture-builder { background-color: var(--card-bg-color); padding: 2rem; border-radius: 8px; box-shadow: var(--shadow); }
53
+ .arch-type-selector { display: flex; gap: 1rem; margin-bottom: 2rem; border-bottom: 1px solid var(--border-color); padding-bottom: 1.5rem; }
54
+ .arch-type-chip { padding: 0.8rem 1.5rem; border-radius: 8px; cursor: pointer; font-weight: 500; font-size: 1.1rem; border: 2px solid transparent; transition: all 0.2s ease; }
55
+ .arch-type-chip.active.classic { background-color: #e3f2fd; border-color: var(--primary-color); color: var(--primary-dark); }
56
+ .arch-type-chip.active.gen-ai { background-color: #f3e5f5; border-color: var(--genai-color); color: var(--genai-color); }
57
 
58
+ .builder-fields { display: none; }
59
+ .builder-fields.active { display: block; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
+ .selection-group { margin-bottom: 1.5rem; transition: opacity 0.3s ease; }
62
+ .selection-group.disabled { opacity: 0.5; pointer-events: none; }
63
+ .selection-group h4 { margin-top: 0; margin-bottom: 1rem; font-size: 1.2rem; color: var(--secondary-color); }
64
+ .selection-chips { display: flex; flex-wrap: wrap; gap: 0.75rem; }
65
+ .chip {
66
+ padding: 0.6rem 1.2rem; border: 2px solid var(--border-color); border-radius: 20px;
67
+ cursor: pointer; transition: all 0.2s ease; font-weight: 500; background-color: #f9f9f9;
68
  }
69
+ .chip:not(.disabled):hover { border-color: var(--primary-dark); background-color: #e3f2fd; }
70
+ .chip.active { background-color: var(--primary-color); color: white; border-color: var(--primary-color); }
71
+ .chip.disabled { opacity: 0.6; cursor: not-allowed; background-color: #f0f0f0; border-color: var(--border-color); color: #999; }
72
 
73
+ #generate-btn {
74
+ background-color: var(--secondary-color); color: white; border: none; padding: 0.8rem 2rem; font-size: 1.1rem;
75
+ font-weight: 500; border-radius: 6px; cursor: pointer; transition: background-color 0.2s;
76
+ display: block; margin-top: 2rem; width: 100%;
77
  }
78
+ #generate-btn:hover { background-color: #00695C; }
79
 
80
+ /* --- Architecture Diagram Output --- */
81
+ #architecture-diagram-output {
82
+ display: none; margin-top: 2rem; background-color: #fdfdfd; border: 1px solid var(--border-color);
83
+ padding: 2rem; border-radius: 8px; text-align: center;
 
 
 
 
84
  }
85
+ .diagram-title { font-size: 1.5rem; font-weight: 500; margin-bottom: 2rem; }
86
+ .diagram-stack { display: flex; flex-direction: column; align-items: center; gap: 0.5rem; }
87
+ .diagram-layer {
88
+ background-color: var(--card-bg-color); border: 2px solid var(--primary-color); border-radius: 8px;
89
+ padding: 1.5rem 2.5rem; width: 80%; max-width: 500px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); text-align: center;
90
  }
91
+ .diagram-layer.gen-ai-layer { border-color: var(--genai-color); }
92
+ .diagram-layer.gen-ai-layer h5 { color: var(--genai-color); }
93
+ .diagram-layer h5 { margin: 0 0 0.5rem 0; color: var(--primary-dark); font-size: 1.2rem; font-weight: 700; }
94
+ .diagram-layer p { margin: 0; font-size: 1rem; color: var(--subtle-text-color); }
95
+ .diagram-arrow { font-family: 'Material Icons'; font-size: 2.5rem; color: var(--primary-color); line-height: 1; }
96
+ .diagram-arrow.gen-ai-arrow { color: var(--genai-color); }
97
+ .icon-img-placeholder {
98
+ height: 32px;
99
+ max-width: 120px;
100
+ width: auto;
101
+ margin-top: 10px;
102
  }
103
 
104
+ /* --- Reference Tiles and Panels, Code & Details --- */
105
+ .tile-container { display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 1.5rem; margin-bottom: 2.5rem; }
106
+ .tile { background-color: var(--card-bg-color); border: 2px solid var(--border-color); border-radius: 8px; padding: 1.5rem; text-align: center; cursor: pointer; transition: all 0.2s ease; display: flex; flex-direction: column; align-items: center; justify-content: center; min-height: 150px; }
107
+ .tile:hover { transform: translateY(-5px); box-shadow: var(--tile-hover-shadow); border-color: var(--primary-color); }
108
+ .tile.active { border-color: var(--primary-color); box-shadow: var(--tile-hover-shadow); background-color: #f0f7ff; }
109
+ .tile-icon-img {
110
+ height: 48px;
111
+ width: auto;
112
+ max-width: 100%;
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  margin-bottom: 1rem;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  }
115
+ .tile h4 { margin: 0; font-size: 1.2rem; color: var(--heading-color); }
116
+ .content-panel { display: none; background-color: var(--card-bg-color); border-radius: 8px; box-shadow: var(--shadow); padding: 2.5rem; margin-top: 1rem; }
117
+ .content-panel.active { display: block; }
118
+ .stack-layer { margin-bottom: 2.5rem; padding-bottom: 1.5rem; border-bottom: 1px solid var(--border-color); }
119
+ .stack-layer:last-child { border-bottom: none; margin-bottom: 0; }
120
+ .stack-layer h3 { font-size: 1.6rem; color: var(--secondary-color); margin-top: 0; display: flex; align-items: center; }
121
+ .stack-layer h3 .material-icons { margin-right: 12px; font-size: 2rem; }
122
+ details { border: 1px solid var(--border-color); border-radius: 6px; margin-bottom: 1rem; background-color: #f9fafb; }
123
+ summary { cursor: pointer; padding: 1rem; font-weight: 500; font-size: 1.1rem; list-style: none; display: flex; align-items: center; justify-content: space-between; }
124
+ pre { background-color: var(--code-bg-color); color: var(--code-text-color); padding: 1.5rem 1rem 1rem 1rem; border-radius: 6px; overflow-x: auto; font-size: 0.9em; position: relative; }
125
  code { font-family: 'Courier New', Courier, monospace; }
126
+ .copy-btn { position: absolute; top: 10px; right: 10px; background-color: #4a505c; color: #fff; border: none; padding: 6px 10px; border-radius: 4px; cursor: pointer; opacity: 0.7; }
 
127
  pre:hover .copy-btn { opacity: 1; }
 
128
  .copy-btn.copied { background-color: var(--primary-dark); }
129
+ .code-block-header { font-weight: bold; color: var(--subtle-text-color); margin-bottom: -0.5rem; margin-top: 1rem; }
 
130
  </style>
131
  </head>
132
  <body>
133
 
134
  <div class="container">
135
  <header>
136
+ <h1>MLOps Architecture Builder & Cheatsheet</h1>
137
+ <p>Design your custom model serving stack using the builder below, or explore detailed deployment guides for common frameworks.</p>
138
  </header>
139
 
140
  <main>
141
+ <!-- ======================= My Architecture Builder ======================= -->
142
+ <h2 class="main-section-title"><i class="material-icons">architecture</i>My Architecture</h2>
143
+ <div id="architecture-builder">
144
+ <div class="arch-type-selector">
145
+ <div class="arch-type-chip active classic" data-type="classic">Classic ML</div>
146
+ <div class="arch-type-chip gen-ai" data-type="gen-ai">Generative AI</div>
 
 
 
 
147
  </div>
148
+
149
+ <!-- Classic Builder Fields -->
150
+ <div id="classic-builder-fields" class="builder-fields active">
151
+ <div class="selection-group" data-group="framework">
152
+ <h4>1. ML Framework</h4>
153
+ <div class="selection-chips">
154
+ <div class="chip" data-id="scikit-learn">Scikit-learn</div>
155
+ <div class="chip" data-id="xgboost">XGBoost</div>
156
+ <div class="chip" data-id="pytorch">PyTorch</div>
157
+ <div class="chip" data-id="tensorflow">TensorFlow</div>
158
+ <div class="chip" data-id="jax">JAX</div>
159
+ <div class="chip" data-id="keras">Keras</div>
160
+ </div>
161
+ </div>
162
+ <div class="selection-group" data-group="serving">
163
+ <h4>2. Serving Container</h4>
164
+ <div class="selection-chips">
165
+ <div class="chip" data-id="kserve">Kubeflow KServe</div>
166
+ <div class="chip" data-id="ray-serve">Ray Serve</div>
167
+ <div class="chip" data-id="torchserve">TorchServe</div>
168
+ <div class="chip" data-id="tf-serving">TF Serving</div>
169
+ <div class="chip" data-id="triton">NVIDIA Triton</div>
170
+ <div class="chip" data-id="custom">Custom Container (FastAPI)</div>
171
+ </div>
172
+ </div>
173
+ <div class="selection-group" data-group="orchestration">
174
+ <h4>3. Orchestration / Platform</h4>
175
+ <div class="selection-chips">
176
+ <div class="chip active" data-id="kubernetes">Kubernetes</div>
177
+ <div class="chip" data-id="vertex-ai">Managed: Vertex AI</div>
178
+ <div class="chip" data-id="sagemaker">Managed: SageMaker</div>
179
+ </div>
180
+ </div>
181
+ <div class="selection-group" data-group="hardware">
182
+ <h4>4. Hardware</h4>
183
+ <div class="selection-chips">
184
+ <div class="chip" data-id="vm">VMs (CPU)</div>
185
+ <div class="chip" data-id="gpu">GPU</div>
186
+ <div class="chip" data-id="tpu">TPU</div>
187
+ </div>
188
+ </div>
189
  </div>
190
+
191
+ <!-- Gen AI Builder Fields -->
192
+ <div id="genai-builder-fields" class="builder-fields">
193
+ <div class="selection-group" data-group="model-type">
194
+ <h4>0. Model Type</h4>
195
+ <div class="selection-chips">
196
+ <div class="chip" data-id="llm">LLM</div>
197
+ <div class="chip" data-id="vlm">Multimodal LLM (VLM)</div>
198
+ <div class="chip" data-id="diffusion">Diffusion</div>
199
+ </div>
200
+ </div>
201
+ <div class="selection-group" data-group="framework">
202
+ <h4>1. ML Framework</h4>
203
+ <div class="selection-chips">
204
+ <div class="chip" data-id="pytorch">PyTorch</div>
205
+ <div class="chip" data-id="tensorflow">TensorFlow</div>
206
+ <div class="chip" data-id="jax">JAX</div>
207
+ <div class="chip" data-id="keras">Keras</div>
208
+ </div>
209
+ </div>
210
+ <div class="selection-group" data-group="serving">
211
+ <h4>2. Serving Container</h4>
212
+ <div class="selection-chips">
213
+ <div class="chip" data-id="vllm">vLLM</div>
214
+ <div class="chip" data-id="sglang">SGLang</div>
215
+ <div class="chip" data-id="triton-trt-llm">NVIDIA Triton (TensorRT-LLM)</div>
216
+ <div class="chip" data-id="custom">Custom Container (Diffusers, etc.)</div>
217
+ </div>
218
+ </div>
219
+ <div class="selection-group" data-group="orchestration">
220
+ <h4>3. Orchestration / Platform</h4>
221
+ <div class="selection-chips">
222
+ <div class="chip active" data-id="k8s-ray-kf">Kubernetes (KubeRay/Kubeflow)</div>
223
+ <div class="chip" data-id="vertex-ai">Managed: Vertex AI</div>
224
+ <div class="chip" data-id="sagemaker">Managed: SageMaker</div>
225
+ </div>
226
+ </div>
227
+ <div class="selection-group" data-group="hardware">
228
+ <h4>4. Hardware</h4>
229
+ <div class="selection-chips">
230
+ <div class="chip" data-id="gpu">GPU</div>
231
+ <div class="chip" data-id="tpu">TPU</div>
232
+ </div>
233
+ </div>
234
  </div>
235
+
236
+ <button id="generate-btn">Generate Architecture Diagram</button>
237
  </div>
238
+
239
+ <div id="architecture-diagram-output"></div>
240
 
241
+ <h2 class="main-section-title"><i class="material-icons">menu_book</i>Reference Guides</h2>
242
+
243
+ <h3 class="main-section-title" style="font-size: 1.8rem; border-color: var(--primary-color);"><i class="material-icons" style="color: var(--primary-color);">model_training</i>Classic ML</h3>
244
  <div class="tile-container">
245
+ <div class="tile" data-target="classic-pytorch"><img src="pytorch.png" class="tile-icon-img" alt="PyTorch Icon"><h4>PyTorch</h4></div>
246
+ <div class="tile" data-target="classic-tensorflow"><img src="tensorflow.png" class="tile-icon-img" alt="TensorFlow Icon"><h4>TensorFlow</h4></div>
247
+ <div class="tile" data-target="classic-sklearn"><img src="scikit-learn.png" class="tile-icon-img" alt="Scikit-learn Icon"><h4>Scikit-learn</h4></div>
248
+ <div class="tile" data-target="classic-xgboost"><img src="xgboost.png" class="tile-icon-img" alt="XGBoost Icon"><h4>XGBoost</h4></div>
249
+ <div class="tile" data-target="classic-jax"><img src="jax.png" class="tile-icon-img" alt="JAX Icon"><h4>JAX</h4></div>
 
 
 
 
 
 
 
250
  </div>
 
 
 
251
 
252
+ <h3 class="main-section-title" style="font-size: 1.8rem; border-color: var(--genai-color);"><i class="material-icons" style="color: var(--genai-color);">auto_awesome</i>Generative AI</h3>
253
+ <div class="tile-container">
254
+ <div class="tile" data-target="genai-llm"><img src="llm.png" class="tile-icon-img" alt="LLM Icon"><h4>LLMs</h4></div>
255
+ <div class="tile" data-target="genai-vlm"><img src="vlm.png" class="tile-icon-img" alt="VLM Icon"><h4>Multimodal (VLMs)</h4></div>
256
+ <div class="tile" data-target="genai-diffusion"><img src="diffusion.png" class="tile-icon-img" alt="Diffusion Icon"><h4>Diffusion Models</h4></div>
257
+ </div>
258
+
259
+ <div class="content-container">
260
+ <!-- Classic ML Panels -->
261
+ <div id="classic-pytorch" class="content-panel">
262
+ <div class="stack-layer"><h3><i class="material-icons">psychology</i>Model Layer</h3>
263
  <p>A simple feed-forward network defined in PyTorch. The model's `state_dict` is saved for deployment.</p>
264
  <p class="code-block-header">model_setup.py</p>
265
+ <pre><code>import torch
 
266
  import torch.nn as nn
 
267
  class SimpleNet(nn.Module):
268
  def __init__(self):
269
  super(SimpleNet, self).__init__()
270
  self.linear = nn.Linear(10, 1)
271
+ def forward(self, x): return self.linear(x)
 
 
 
272
  model = SimpleNet()
273
+ torch.save(model.state_dict(), "pytorch_model.pth")</code></pre>
 
274
  </div>
275
+ <div class="stack-layer"><h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
276
+ <p>Use a high-performance framework like FastAPI for a custom server. For dedicated solutions, TorchServe is the native choice, while Kubeflow KServe, Ray Serve, and NVIDIA Triton offer powerful, managed abstractions.</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  </div>
278
+ <div class="stack-layer"><h3><i class="material-icons">cloud_queue</i>Orchestration Layer</h3>
279
+ <p>Package the application with a multi-stage Dockerfile and define its runtime with Kubernetes Deployment, Service, and HPA objects. Managed platforms like Vertex AI abstract this away.</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  </div>
281
+ <div class="stack-layer"><h3><i class="material-icons">memory</i>Hardware Layer</h3>
282
+ <p><strong>CPUs:</strong> Suitable for small networks. <strong>GPUs:</strong> Essential for deep learning models. <strong>TPUs:</strong> Best for massive-scale inference on GCP.</p>
 
283
  </div>
284
  </div>
 
 
285
  <div id="classic-tensorflow" class="content-panel">
286
+ <div class="stack-layer"><h3><i class="material-icons">psychology</i>Model Layer</h3>
 
 
287
  <p>A simple Keras model saved in TensorFlow's `SavedModel` format, which bundles the architecture and weights.</p>
288
  <p class="code-block-header">model_setup.py</p>
289
+ <pre><code>import tensorflow as tf
 
290
  model = tf.keras.Sequential([
291
  tf.keras.layers.Dense(10, activation='relu', input_shape=(10,)),
292
  tf.keras.layers.Dense(1)
293
  ])
294
+ model.save("tf_saved_model")</code></pre>
 
295
  </div>
296
+ <div class="stack-layer"><h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
297
+ <p>TF Serving and Kubeflow KServe offer native, high-performance support for the `SavedModel` format. NVIDIA Triton is also highly optimized for TF models. A custom FastAPI server is another flexible option.</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
  </div>
299
+ <div class="stack-layer"><h3><i class="material-icons">cloud_queue</i>Orchestration Layer</h3>
300
+ <p>The Kubernetes configuration is very similar to other frameworks. Ensure your Dockerfile copies the entire `tf_saved_model` directory.</p>
 
301
  </div>
302
+ <div class="stack-layer"><h3><i class="material-icons">memory</i>Hardware Layer</h3>
303
+ <p><strong>CPUs:</strong> Good for smaller Keras models. <strong>GPUs:</strong> Highly recommended for deep learning models. <strong>TPUs:</strong> The premier choice for running TensorFlow models at scale on GCP.</p>
 
304
  </div>
305
  </div>
 
 
306
  <div id="classic-sklearn" class="content-panel">
307
+ <div class="stack-layer"><h3><i class="material-icons">psychology</i>Model Layer</h3>
 
308
  <p>A classic logistic regression model. Serialization is typically done with `joblib` for efficiency with NumPy structures.</p>
309
  <p class="code-block-header">model_setup.py</p>
310
+ <pre><code>import joblib
 
311
  from sklearn.linear_model import LogisticRegression
312
  from sklearn.datasets import make_classification
 
313
  X, y = make_classification(n_features=4)
314
  model = LogisticRegression().fit(X, y)
315
+ joblib.dump(model, "sklearn_model.joblib")</code></pre>
 
316
  </div>
317
+ <div class="stack-layer"><h3><i class="material-icons">layers</i>Serving Stack Layer</h3>
318
+ <p>FastAPI provides a simple and fast web server. Kubeflow KServe and Ray Serve also have native support for scikit-learn models. NVIDIA Triton is an option for CPU-optimized execution using its FIL backend.</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
  </div>
320
+ <div class="stack-layer"><h3><i class="material-icons">cloud_queue</i>Orchestration Layer</h3>
321
+ <p>Standard Kubernetes setup. The Docker container will be lightweight as it only needs `scikit-learn`, `joblib`, and `fastapi` for a custom server.</p>
 
322
  </div>
323
+ <div class="stack-layer"><h3><i class="material-icons">memory</i>Hardware Layer</h3>
324
+ <p><strong>CPUs:</strong> Almost always sufficient. There is no GPU acceleration for standard scikit-learn algorithms.</p>
 
325
  </div>
326
  </div>
 
 
327
  <div id="classic-xgboost" class="content-panel">
328
+ <div class="stack-layer"><h3><i class="material-icons">psychology</i>Model Layer</h3><p>An XGBoost model saved in its native JSON or UBJ format, which is portable and efficient.</p></div>
329
+ <div class="stack-layer"><h3><i class="material-icons">layers</i>Serving Stack Layer</h3><p>Kubeflow KServe, Ray Serve, NVIDIA Triton (with FIL backend), and custom FastAPI servers are all excellent choices.</p></div>
330
+ <div class="stack-layer"><h3><i class="material-icons">cloud_queue</i>Orchestration Layer</h3><p>Standard Kubernetes setup. The Dockerfile should include the `xgboost` library.</p></div>
331
+ <div class="stack-layer"><h3><i class="material-icons">memory</i>Hardware Layer</h3><p><strong>CPUs:</strong> Excellent performance. <strong>GPUs:</strong> XGBoost has optional GPU acceleration which can provide a significant speedup.</p></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
  </div>
 
 
333
  <div id="classic-jax" class="content-panel">
334
+ <div class="stack-layer"><h3><i class="material-icons">psychology</i>Model Layer</h3><p>JAX models are often defined as pure functions with parameters handled separately. We save the parameters using a standard serialization library like Flax's `msgpack`.</p></div>
335
+ <div class="stack-layer"><h3><i class="material-icons">layers</i>Serving Stack Layer</h3><p>Ray Serve is an excellent fit for JAX's functional paradigm. A custom FastAPI server is also straightforward. Kubeflow KServe and NVIDIA Triton require a custom container approach wrapping the JAX logic.</p></div>
336
+ <div class="stack-layer"><h3><i class="material-icons">cloud_queue</i>Orchestration Layer</h3><p>The Dockerfile needs to install `jax` and `jaxlib` corresponding to the target hardware (CPU, GPU, or TPU).</p></div>
337
+ <div class="stack-layer"><h3><i class="material-icons">memory</i>Hardware Layer</h3><p><strong>CPUs/GPUs/TPUs:</strong> JAX was designed for accelerators and excels on all of them due to its XLA-based compilation.</p></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  </div>
339
+
340
+ <!-- Gen AI Panels -->
341
  <div id="genai-llm" class="content-panel">
342
+ <div class="stack-layer"><h3><i class="material-icons">psychology</i>Model Layer</h3><p>Large Language Models (e.g., Llama, Mistral) are based on the Transformer architecture. The key inference challenge is managing the <strong>KV Cache</strong>.</p></div>
343
+ <div class="stack-layer"><h3><i class="material-icons">layers</i>Serving Stack Layer</h3><p>Specialized serving toolkits like <strong>vLLM</strong>, <strong>SGLang</strong>, or <strong>NVIDIA Triton</strong> with its TensorRT-LLM backend are required for efficient inference, handling complexities like continuous batching and paged attention.</p></div>
344
+ <div class="stack-layer"><h3><i class="material-icons">cloud_queue</i>Orchestration Layer</h3><p>Kubernetes (often with KubeRay) is used to manage GPU resources and schedule serving pods. Managed services like Vertex AI and SageMaker also provide optimized runtimes for popular LLMs.</p></div>
345
+ <div class="stack-layer"><h3><i class="material-icons">memory</i>Hardware Layer</h3><p><strong>GPUs:</strong> Essential. High-VRAM GPUs like NVIDIA A100 or H100 are required to fit the model weights and KV cache. <strong>TPUs:</strong> Viable for specific models, especially on GCP.</p></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  </div>
 
 
347
  <div id="genai-vlm" class="content-panel">
348
+ <div class="stack-layer"><h3><i class="material-icons">psychology</i>Model Layer</h3><p>Visual Large Models (e.g., LLaVA, IDEFICS) combine a vision encoder (like ViT) with an LLM to process images and text.</p></div>
349
+ <div class="stack-layer"><h3><i class="material-icons">layers</i>Serving Stack Layer</h3><p>The stack must handle multi-modal inputs. Frameworks like <strong>vLLM</strong> and <strong>SGLang</strong> are adding native support for VLMs. A custom container is often needed to handle the specific image preprocessing logic.</p></div>
350
+ <div class="stack-layer"><h3><i class="material-icons">cloud_queue</i>Orchestration Layer</h3><p>Similar to LLMs, requires robust orchestration to manage high-resource GPU pods and potentially large input payloads.</p></div>
351
+ <div class="stack-layer"><h3><i class="material-icons">memory</i>Hardware Layer</h3><p><strong>GPUs:</strong> High-VRAM GPUs are mandatory due to the combined size of the vision encoder, LLM, and KV cache.</p></div>
 
 
 
 
 
 
 
 
 
 
 
 
352
  </div>
 
 
353
  <div id="genai-diffusion" class="content-panel">
354
+ <div class="stack-layer"><h3><i class="material-icons">psychology</i>Model Layer</h3><p>Diffusion models (e.g., Stable Diffusion) generate images through an iterative denoising process, making latency a key challenge.</p></div>
355
+ <div class="stack-layer"><h3><i class="material-icons">layers</i>Serving Stack Layer</h3><p>Optimizations focus on reducing latency. Key tools include model compilers like <strong>TensorRT</strong> (often used with NVIDIA Triton), techniques like <strong>Latent Consistency Models (LCMs)</strong>, and libraries like <strong>Diffusers</strong>, typically wrapped in a custom FastAPI container.</p></div>
356
+ <div class="stack-layer"><h3><i class="material-icons">cloud_queue</i>Orchestration Layer</h3><p>Kubernetes or managed platforms are used to serve the GPU-intensive workload. Autoscaling is critical to handle bursty traffic patterns.</p></div>
357
+ <div class="stack-layer"><h3><i class="material-icons">memory</i>Hardware Layer</h3><p><strong>GPUs:</strong> High-end consumer or datacenter GPUs are needed for acceptable generation speeds. VRAM is the most critical resource, dictating max resolution and batch size.</p></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
  </div>
359
  </div>
360
  </main>
 
362
 
363
  <script>
364
  document.addEventListener('DOMContentLoaded', function() {
365
+ const builder = document.getElementById('architecture-builder');
366
+ const generateBtn = document.getElementById('generate-btn');
367
+ const diagramOutput = document.getElementById('architecture-diagram-output');
368
+
369
+ const archTypeSelector = builder.querySelector('.arch-type-selector');
370
+ const classicFields = document.getElementById('classic-builder-fields');
371
+ const genaiFields = document.getElementById('genai-builder-fields');
372
+
373
+ function updateChipStates() {
374
+ const activeArchType = archTypeSelector.querySelector('.active').dataset.type;
375
+ const activeBuilderFields = (activeArchType === 'classic') ? classicFields : genaiFields;
376
+
377
+ if (activeArchType === 'classic') {
378
+ const activeFramework = activeBuilderFields.querySelector('.selection-group[data-group="framework"] .chip.active');
379
+ const torchserveChip = activeBuilderFields.querySelector('.chip[data-id="torchserve"]');
380
+ const tfservingChip = activeBuilderFields.querySelector('.chip[data-id="tf-serving"]');
381
+
382
+ [torchserveChip, tfservingChip].forEach(c => c.classList.remove('disabled'));
383
+
384
+ if (activeFramework) {
385
+ const frameworkId = activeFramework.dataset.id;
386
+ const nonTfTsFrameworks = ['scikit-learn', 'xgboost', 'jax'];
387
+ if (frameworkId === 'pytorch') {
388
+ tfservingChip.classList.add('disabled');
389
+ if(tfservingChip.classList.contains('active')) tfservingChip.classList.remove('active');
390
+ } else if (frameworkId === 'tensorflow') {
391
+ torchserveChip.classList.add('disabled');
392
+ if(torchserveChip.classList.contains('active')) torchserveChip.classList.remove('active');
393
+ } else if (nonTfTsFrameworks.includes(frameworkId)) {
394
+ [torchserveChip, tfservingChip].forEach(c => {
395
+ c.classList.add('disabled');
396
+ if(c.classList.contains('active')) c.classList.remove('active');
397
+ });
398
+ }
399
+ }
400
+ } else { // Gen AI Logic
401
+ const activeModelType = activeBuilderFields.querySelector('.selection-group[data-group="model-type"] .chip.active');
402
+ const vllmChip = activeBuilderFields.querySelector('.chip[data-id="vllm"]');
403
+ const sglangChip = activeBuilderFields.querySelector('.chip[data-id="sglang"]');
404
+
405
+ [vllmChip, sglangChip].forEach(c => c.classList.remove('disabled'));
406
+
407
+ if (activeModelType && activeModelType.dataset.id === 'diffusion') {
408
+ [vllmChip, sglangChip].forEach(c => {
409
+ c.classList.add('disabled');
410
+ if(c.classList.contains('active')) c.classList.remove('active');
411
+ });
412
+ }
413
+ }
414
 
415
+ const activeOrchestration = activeBuilderFields.querySelector('.selection-group[data-group="orchestration"] .chip.active');
416
+ const servingGroup = activeBuilderFields.querySelector('.selection-group[data-group="serving"]');
417
+
418
+ if (activeOrchestration && (activeOrchestration.dataset.id === 'vertex-ai' || activeOrchestration.dataset.id === 'sagemaker')) {
419
+ servingGroup.classList.add('disabled');
420
+ servingGroup.querySelector('.chip.active')?.classList.remove('active');
421
+ } else {
422
+ servingGroup.classList.remove('disabled');
423
+ }
424
+ }
425
+
426
+ archTypeSelector.addEventListener('click', function(e){
427
+ if (!e.target.classList.contains('arch-type-chip')) return;
428
+ archTypeSelector.querySelectorAll('.arch-type-chip').forEach(c => c.classList.remove('active'));
429
+ e.target.classList.add('active');
430
+ const type = e.target.dataset.type;
431
+ classicFields.classList.toggle('active', type === 'classic');
432
+ genaiFields.classList.toggle('active', type === 'gen-ai');
433
+ diagramOutput.style.display = 'none';
434
+ updateChipStates();
435
+ });
436
+
437
+ builder.addEventListener('click', function(e) {
438
+ if (!e.target.classList.contains('chip') || e.target.classList.contains('disabled')) return;
439
+ const chip = e.target;
440
+ const group = chip.closest('.selection-group');
441
+ if (group.classList.contains('disabled')) return;
442
+ group.querySelectorAll('.chip').forEach(c => c.classList.remove('active'));
443
+ chip.classList.add('active');
444
+ updateChipStates();
445
+ });
446
+
447
// Build and display the architecture diagram from the user's chip selections.
// Collects one active chip per visible selection group, validates that every
// required layer is chosen, then renders the stacked diagram into diagramOutput.
generateBtn.addEventListener('click', function() {
    const activeArchType = archTypeSelector.querySelector('.active').dataset.type;
    const activeBuilderFields = document.querySelector('.builder-fields.active');
    const selections = {};
    let allSelected = true;

    // Query the orchestration choice once (the original queried it twice with
    // duplicated selectors). A managed platform replaces the separate serving
    // layer. Exact-id comparison matches the convention used by
    // updateChipStates() instead of fragile substring matching.
    const orchestrationChip = activeBuilderFields.querySelector('.selection-group[data-group="orchestration"] .chip.active');
    const isManaged = orchestrationChip !== null &&
        (orchestrationChip.dataset.id === 'vertex-ai' || orchestrationChip.dataset.id === 'sagemaker');

    activeBuilderFields.querySelectorAll('.selection-group').forEach(group => {
        const groupKey = group.dataset.group;
        // Managed platforms bundle serving, so that group is not required.
        if (isManaged && groupKey === 'serving') return;

        const activeChip = group.querySelector('.chip.active');
        if (activeChip) {
            selections[groupKey] = { name: activeChip.innerText, id: activeChip.dataset.id };
        } else {
            allSelected = false;
        }
    });

    if (!allSelected) {
        alert('Please make a selection for each required layer.');
        return;
    }

    let diagramHtml = `<h3 class="diagram-title">Your Custom ${activeArchType === 'gen-ai' ? 'Generative AI' : 'Classic ML'} Architecture</h3><div class="diagram-stack">`;
    const arrowClass = activeArchType === 'gen-ai' ? 'gen-ai-arrow' : '';
    const layerClass = activeArchType === 'gen-ai' ? 'gen-ai-layer' : '';

    // Each layer shows an icon image named after the selection's data-id.
    function createImageTag(selection) {
        return `<img src="${selection.id}.png" alt="${selection.name} Icon" class="icon-img-placeholder">`;
    }

    // Gen-AI architectures have an extra "Model Type" layer on top.
    if (activeArchType === 'gen-ai') {
        diagramHtml += `<div class="diagram-layer ${layerClass}"><h5>${selections['model-type'].name}</h5><p>Model Type</p>${createImageTag(selections['model-type'])}</div><div class="diagram-arrow ${arrowClass}">south</div>`;
    }

    diagramHtml += `<div class="diagram-layer ${layerClass}"><h5>${selections.framework.name}</h5><p>ML Framework</p>${createImageTag(selections.framework)}</div><div class="diagram-arrow ${arrowClass}">south</div>`;

    if (isManaged) {
        // Managed platforms collapse serving + orchestration into one layer.
        diagramHtml += `<div class="diagram-layer ${layerClass}"><h5>${selections.orchestration.name}</h5><p>Managed Platform</p>${createImageTag(selections.orchestration)}</div><div class="diagram-arrow ${arrowClass}">south</div>`;
    } else {
        diagramHtml += `<div class="diagram-layer ${layerClass}"><h5>${selections.serving.name}</h5><p>Serving Container</p>${createImageTag(selections.serving)}</div><div class="diagram-arrow ${arrowClass}">south</div>`;
        diagramHtml += `<div class="diagram-layer ${layerClass}"><h5>${selections.orchestration.name}</h5><p>Orchestration</p>${createImageTag(selections.orchestration)}</div><div class="diagram-arrow ${arrowClass}">south</div>`;
    }

    diagramHtml += `<div class="diagram-layer ${layerClass}"><h5>${selections.hardware.name}</h5><p>Hardware</p>${createImageTag(selections.hardware)}</div>`;
    diagramHtml += `</div>`;

    diagramOutput.innerHTML = diagramHtml;
    diagramOutput.style.display = 'block';
    diagramOutput.scrollIntoView({ behavior: 'smooth', block: 'center' });
});
501
+
502
// Tile navigation: clicking a tile highlights it and reveals the matching
// content panel (looked up via the tile's data-target attribute).
const tiles = document.querySelectorAll('.tile');
const contentPanels = document.querySelectorAll('.content-panel');
tiles.forEach((tile) => {
  tile.addEventListener('click', (event) => {
    const clickedTile = event.currentTarget;
    tiles.forEach((other) => other.classList.remove('active'));
    clickedTile.classList.add('active');
    contentPanels.forEach((panelEl) => panelEl.classList.remove('active'));
    const targetPanel = document.getElementById(clickedTile.dataset.target);
    if (!targetPanel) return;
    targetPanel.classList.add('active');
    targetPanel.scrollIntoView({ behavior: 'smooth', block: 'start' });
  });
});
515
 
516
// Inject a "Copy" button into every code block that doesn't already have one.
// Clicking copies the block's text to the clipboard and shows brief feedback.
document.querySelectorAll('pre code').forEach(codeBlock => {
    const pre = codeBlock.parentElement;
    if (!pre.querySelector('.copy-btn')) {
        const copyButton = document.createElement('button');
        // Explicit type: default for <button> is "submit", which would submit
        // an enclosing form if one ever wraps the snippet.
        copyButton.type = 'button';
        copyButton.innerText = 'Copy';
        copyButton.className = 'copy-btn';
        pre.appendChild(copyButton);
        copyButton.addEventListener('click', (e) => {
            e.stopPropagation();
            navigator.clipboard.writeText(codeBlock.innerText).then(() => {
                copyButton.innerText = 'Copied!';
                copyButton.classList.add('copied');
                setTimeout(() => { copyButton.innerText = 'Copy'; copyButton.classList.remove('copied'); }, 2000);
            }).catch(() => {
                // writeText rejects when clipboard permission is denied or the
                // page is not in a secure context; the original left this as a
                // silent unhandled rejection with no user feedback.
                copyButton.innerText = 'Error';
                setTimeout(() => { copyButton.innerText = 'Copy'; }, 2000);
            });
        });
    }
});
533
+ updateChipStates();
534
  });
535
  </script>
536
  </body>