| | """ |
| | GPT-300M Visual Neural Network β Node & Connection Style |
| | ========================================================== |
| | Generates a classic neural network diagram (like the user's reference) |
| | with nodes and connection lines, accurately showing the GPT-300M architecture |
| | with correct parameter calculations at each layer. |
| | """ |
| |
|
| | import matplotlib |
| | matplotlib.use("Agg") |
| |
|
| | import matplotlib.pyplot as plt |
| | import matplotlib.patches as mpatches |
| | import numpy as np |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|
# ---------------------------------------------------------------------------
# GPT-300M architecture hyper-parameters
# ---------------------------------------------------------------------------
VOCAB_SIZE = 32_000
D_MODEL = 1_024
N_HEADS = 16
HEAD_DIM = 64
D_FF = 4_096
N_LAYERS = 24

# Token-embedding table: one d_model-wide vector per vocabulary entry.
embed_params = VOCAB_SIZE * D_MODEL

# Rotary position embeddings are computed on the fly — no learned weights.
rope_params = 0

# Self-attention: fused Q/K/V projections plus the output projection,
# each a d_model × d_model matrix.
qkv_params = 3 * D_MODEL * D_MODEL
out_proj_params = D_MODEL * D_MODEL
attn_total = qkv_params + out_proj_params

# Feed-forward block: up-projection to d_ff, then back down to d_model.
ffn_up_params = D_MODEL * D_FF
ffn_down_params = D_FF * D_MODEL
ffn_total = ffn_up_params + ffn_down_params

# Two RMSNorm scale vectors per transformer block (pre-attention, pre-FFN).
rmsnorm_params = D_MODEL * 2
layer_total = attn_total + ffn_total + rmsnorm_params

# Full transformer stack.
all_layers_total = layer_total * N_LAYERS

# One final RMSNorm after the last block.
final_norm_params = D_MODEL

# The LM head ties (shares) its weight matrix with the token embedding,
# so it contributes no additional parameters.
lm_head_params = 0

TOTAL_PARAMS = embed_params + all_layers_total + final_norm_params + lm_head_params
| | |
| | |
| |
|
| | |
| | |
| | |
| |
|
| | |
# Rows rendered top-to-bottom in the full-network diagram:
#   (label, nodes drawn, true layer width, parameter count, color).
# Labels restored from an encoding round-trip that garbled "–" and "×".
LAYERS = [
    ("Input Tokens", 10, VOCAB_SIZE, 0, "#4CAF50"),
    ("Token Embedding", 10, D_MODEL, embed_params, "#2196F3"),
    ("RoPE Positions", 10, D_MODEL, 0, "#00BCD4"),

    # First transformer block, shown sub-layer by sub-layer.
    ("Layer 1: Attention Q,K,V", 12, D_MODEL, qkv_params, "#FF9800"),
    ("Layer 1: Attention Out", 10, D_MODEL, out_proj_params, "#FF9800"),
    ("Layer 1: FFN Up", 14, D_FF, ffn_up_params, "#8BC34A"),
    ("Layer 1: FFN Down", 10, D_MODEL, ffn_down_params, "#8BC34A"),

    # Middle 22 blocks collapsed into a single representative row.
    ("Layer 2\u201323: \u00d722 Blocks", 12, D_MODEL, layer_total * 22, "#9C27B0"),

    # Last transformer block, summarized.
    ("Layer 24: Attention", 12, D_MODEL, attn_total, "#FF5722"),
    ("Layer 24: FFN", 14, D_FF, ffn_total, "#009688"),
    ("Layer 24: Output", 10, D_MODEL, rmsnorm_params, "#009688"),

    ("Final RMSNorm", 10, D_MODEL, final_norm_params, "#E91E63"),
    ("LM Head (tied)", 10, VOCAB_SIZE, lm_head_params, "#F44336"),
    ("Output Probabilities", 1, VOCAB_SIZE, 0, "#F44336"),
]
| |
|
| |
|
def draw_neural_network(save_path="neural_network.png"):
    """Render the full GPT-300M stack as a node-and-connection diagram.

    Each row of ``LAYERS`` is drawn as a line of nodes; consecutive rows are
    joined with a subsampled set of connection lines.  Per-row and cumulative
    parameter counts are annotated, and a boxed summary plus a color legend
    are added before the figure is saved to *save_path*.

    Fixes vs. the previous revision:
      * rendered strings restored from an encoding round-trip that garbled
        "Σ", "•", "×" and the box-drawing characters of the summary panel;
      * the ``max_connections`` cap now actually stops edge drawing (the old
        ``break`` only exited the inner loop, so the outer loop kept going).
    """
    fig, ax = plt.subplots(figsize=(22, 30), facecolor="#0D1117")
    ax.set_facecolor("#0D1117")

    n_layers = len(LAYERS)
    # Rows run from near the top of the axes (0.92) down to 0.04.
    y_positions = np.linspace(0.92, 0.04, n_layers)

    x_center = 0.5
    max_spread = 0.38  # half-width of the widest row, in axes coordinates

    all_node_positions = []
    running_params = 0

    for i, (name, n_display, actual_size, params, color) in enumerate(LAYERS):
        y = y_positions[i]
        running_params += params

        # Node x-coordinates: a single node sits at the center; otherwise
        # spread the nodes symmetrically around it.
        if n_display == 1:
            xs = [x_center]
        else:
            xs = np.linspace(x_center - max_spread, x_center + max_spread, n_display)

        all_node_positions.append((xs, y))

        # Connect this row to the previous one.  Only every k-th node pair is
        # drawn (about 12 per side) and the total is hard-capped so the
        # figure stays legible.
        if i > 0:
            prev_xs, prev_y = all_node_positions[i - 1]

            max_connections = 200
            step_curr = max(1, len(xs) // 12)
            step_prev = max(1, len(prev_xs) // 12)

            conn_count = 0
            for px in prev_xs[::step_prev]:
                if conn_count >= max_connections:
                    break  # enforce the cap across the outer loop too
                for cx in xs[::step_curr]:
                    if conn_count >= max_connections:
                        break
                    ax.plot(
                        [px, cx], [prev_y, y],
                        color=color, alpha=0.22, linewidth=0.6,
                        transform=ax.transAxes, zorder=1,
                    )
                    conn_count += 1

        # Node circles: larger when the row is sparse, largest for a
        # single-node row.
        node_radius = 0.01 if n_display <= 12 else 0.008
        if n_display == 1:
            node_radius = 0.016

        for x in xs:
            circle = plt.Circle(
                (x, y), node_radius,
                facecolor=color, edgecolor="white",
                linewidth=0.6, alpha=0.95,
                transform=ax.transAxes, zorder=3,
            )
            ax.add_patch(circle)

        # Note how many real units the drawn nodes stand in for.
        if actual_size > n_display and n_display > 1:
            extra = actual_size - n_display
            if extra > 0:
                ax.text(
                    xs[-1] + 0.03, y,
                    f"(+{extra:,})",
                    transform=ax.transAxes,
                    fontsize=7, color="#8B949E",
                    ha="left", va="center",
                    fontfamily="monospace",
                )

        # Row label on the left margin.
        ax.text(
            0.02, y,
            name,
            transform=ax.transAxes,
            fontsize=9, fontweight="bold",
            color="#E6EDF3",
            ha="left", va="center",
            fontfamily="monospace",
        )

        # Per-row parameter count on the right margin.
        if params > 0:
            param_text = f"{params:,} params"
            ax.text(
                0.98, y,
                param_text,
                transform=ax.transAxes,
                fontsize=8,
                color=color,
                ha="right", va="center",
                fontfamily="monospace",
                fontweight="bold",
            )

        # Cumulative total just below the per-row count.
        if running_params > 0:
            ax.text(
                0.98, y - 0.012,
                f"\u03a3 {running_params / 1e6:.1f}M",
                transform=ax.transAxes,
                fontsize=6.5,
                color="#8B949E",
                ha="right", va="center",
                fontfamily="monospace",
            )

    # Title and subtitle.
    ax.text(
        0.5, 0.97,
        "GPT-300M Neural Network",
        transform=ax.transAxes,
        fontsize=24, fontweight="bold",
        color="#E6EDF3", ha="center", va="center",
        fontfamily="monospace",
    )
    ax.text(
        0.5, 0.955,
        f"Total: {TOTAL_PARAMS:,} parameters \u2022 {N_LAYERS} transformer layers \u2022 "
        f"{N_HEADS} attention heads \u2022 d_model={D_MODEL}",
        transform=ax.transAxes,
        fontsize=9, color="#8B949E", ha="center", va="center",
        fontfamily="monospace",
    )

    # Boxed parameter-summary panel at the bottom (monospace box drawing).
    summary_y = 0.005
    summary_text = (
        "\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 Parameter Summary \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n"
        f"\u2502 Token Embedding:    {embed_params:>13,} ({embed_params / TOTAL_PARAMS * 100:4.1f}%) \u2502\n"
        f"\u2502 Attention (\u00d7{N_LAYERS}):     {attn_total * N_LAYERS:>13,} ({attn_total * N_LAYERS / TOTAL_PARAMS * 100:4.1f}%) \u2502\n"
        f"\u2502 Feed-Forward (\u00d7{N_LAYERS}):  {ffn_total * N_LAYERS:>13,} ({ffn_total * N_LAYERS / TOTAL_PARAMS * 100:4.1f}%) \u2502\n"
        f"\u2502 RMSNorm (\u00d7{N_LAYERS}+1):    {rmsnorm_params * N_LAYERS + final_norm_params:>13,} ({(rmsnorm_params * N_LAYERS + final_norm_params) / TOTAL_PARAMS * 100:4.1f}%) \u2502\n"
        f"\u2502 LM Head (tied):     {'0 (shared)':>13}         \u2502\n"
        "\u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524\n"
        f"\u2502 TOTAL:              {TOTAL_PARAMS:>13,}  (100%) \u2502\n"
        "\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518"
    )
    ax.text(
        0.5, summary_y,
        summary_text,
        transform=ax.transAxes,
        fontsize=8, color="#58A6FF",
        ha="center", va="bottom",
        fontfamily="monospace",
        bbox=dict(boxstyle="round,pad=0.8", facecolor="#161B22",
                  edgecolor="#30363D", linewidth=1),
    )

    # Color legend in the bottom-left corner.
    legend_items = [
        ("#4CAF50", "Input / Tokenization"),
        ("#2196F3", "Embeddings"),
        ("#FF9800", "Self-Attention"),
        ("#8BC34A", "Feed-Forward (GELU)"),
        ("#9C27B0", "Collapsed Layers (\u00d722)"),
        ("#E91E63", "Normalization"),
        ("#F44336", "Output / LM Head"),
    ]
    for j, (c, label) in enumerate(legend_items):
        lx = 0.02
        ly = 0.035 - j * 0.015
        circle = plt.Circle(
            (lx, ly), 0.004,
            facecolor=c, edgecolor="white", linewidth=0.3,
            transform=ax.transAxes, zorder=5,
        )
        ax.add_patch(circle)
        ax.text(
            lx + 0.012, ly, label,
            transform=ax.transAxes,
            fontsize=7, color="#C9D1D9", va="center",
            fontfamily="monospace",
        )

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis("off")

    plt.savefig(save_path, dpi=200, bbox_inches="tight",
                facecolor="#0D1117", edgecolor="none")
    print(f"Saved: {save_path}")
    plt.close()
| |
|
| |
|
| | |
| | |
| | |
| |
|
def draw_single_layer_detail(save_path="layer_detail.png"):
    """Draw a detailed view of one transformer layer with node connections.

    Renders the sub-layers of a single block (Q/K/V projections, attention
    heads, residual+norm points, FFN up/down) top-to-bottom with sampled
    connection lines and per-sub-layer parameter counts, then saves the
    figure to *save_path*.

    Rendered strings are restored from an encoding round-trip that garbled
    "—", "•", "×" and (presumably) "\u2295" — the residual-add marker glyph
    is an educated reconstruction; TODO confirm against the original.
    """
    fig, ax = plt.subplots(figsize=(20, 14), facecolor="#0D1117")
    ax.set_facecolor("#0D1117")

    # (label, nodes drawn, true width, parameter count, color) per sub-layer.
    sub_layers = [
        ("Input\n(d=1,024)", 8, D_MODEL, 0, "#2196F3"),
        ("Query\n(d=1,024)", 8, D_MODEL, D_MODEL**2, "#FF6B6B"),
        ("Key\n(d=1,024)", 8, D_MODEL, D_MODEL**2, "#4ECDC4"),
        ("Value\n(d=1,024)", 8, D_MODEL, D_MODEL**2, "#45B7D1"),
        ("Attention Heads\n(16\u00d764)", 16, D_MODEL, 0, "#FF9800"),
        ("Attn Output\n(d=1,024)", 8, D_MODEL, D_MODEL**2, "#FF9800"),
        ("\u2295 Residual + Norm", 8, D_MODEL, D_MODEL, "#E91E63"),
        ("FFN Up (GELU)\n(d=4,096)", 14, D_FF, D_MODEL * D_FF, "#8BC34A"),
        ("FFN Down\n(d=1,024)", 8, D_MODEL, D_FF * D_MODEL, "#8BC34A"),
        ("\u2295 Residual + Norm", 8, D_MODEL, D_MODEL, "#E91E63"),
        ("Layer Output\n(d=1,024)", 8, D_MODEL, 0, "#2196F3"),
    ]

    n = len(sub_layers)
    y_positions = np.linspace(0.9, 0.08, n)
    x_center = 0.5
    max_spread = 0.32  # half-width of each node row, in axes coordinates

    all_pos = []

    for i, (name, n_nodes, actual, params, color) in enumerate(sub_layers):
        y = y_positions[i]
        xs = np.linspace(x_center - max_spread, x_center + max_spread, n_nodes)
        all_pos.append((xs, y))

        # Connect to the previous row, sampling roughly 10 nodes per side.
        if i > 0:
            prev_xs, prev_y = all_pos[i - 1]
            step_c = max(1, len(xs) // 10)
            step_p = max(1, len(prev_xs) // 10)
            for px in prev_xs[::step_p]:
                for cx in xs[::step_c]:
                    ax.plot([px, cx], [prev_y, y],
                            color=color, alpha=0.2, linewidth=0.7,
                            transform=ax.transAxes, zorder=1)

        # Node circles: slightly larger for sparse rows.
        r = 0.011 if n_nodes <= 10 else 0.009
        for x in xs:
            c = plt.Circle((x, y), r, facecolor=color, edgecolor="white",
                           linewidth=0.6, alpha=0.95,
                           transform=ax.transAxes, zorder=3)
            ax.add_patch(c)

        # Note how many real units the drawn nodes stand in for.
        if actual > n_nodes:
            ax.text(xs[-1] + 0.025, y, f"(+{actual - n_nodes:,})",
                    transform=ax.transAxes, fontsize=7, color="#8B949E",
                    ha="left", va="center", fontfamily="monospace")

        # Sub-layer label on the left margin.
        ax.text(0.03, y, name, transform=ax.transAxes,
                fontsize=9, fontweight="bold", color="#E6EDF3",
                ha="left", va="center", fontfamily="monospace")

        # Parameter count on the right margin.
        if params > 0:
            ax.text(0.97, y, f"{params:,}", transform=ax.transAxes,
                    fontsize=8, color=color, ha="right", va="center",
                    fontfamily="monospace", fontweight="bold")

    # Title and subtitle.
    ax.text(0.5, 0.96, "Single Transformer Layer \u2014 Detailed View",
            transform=ax.transAxes, fontsize=18, fontweight="bold",
            color="#E6EDF3", ha="center", fontfamily="monospace")
    ax.text(0.5, 0.935,
            f"Parameters per layer: {layer_total:,} \u2022 \u00d7{N_LAYERS} layers = {all_layers_total:,} total",
            transform=ax.transAxes, fontsize=9, color="#8B949E",
            ha="center", fontfamily="monospace")

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis("off")

    plt.savefig(save_path, dpi=200, bbox_inches="tight",
                facecolor="#0D1117", edgecolor="none")
    print(f"Saved: {save_path}")
    plt.close()
| |
|
| |
|
| | if __name__ == "__main__": |
| | import os |
| | os.makedirs("viz", exist_ok=True) |
| |
|
| | print("=" * 50) |
| | print(" GPT-300M Parameter Verification") |
| | print("=" * 50) |
| | print(f" Token Embedding: {embed_params:>13,}") |
| | print(f" Per-layer Attention: {attn_total:>13,}") |
| | print(f" Per-layer FFN: {ffn_total:>13,}") |
| | print(f" Per-layer Norm: {rmsnorm_params:>13,}") |
| | print(f" Per-layer Total: {layer_total:>13,}") |
| | print(f" All {N_LAYERS} layers: {all_layers_total:>13,}") |
| | print(f" Final Norm: {final_norm_params:>13,}") |
| | print(f" LM Head (tied): {'0 (shared)':>13}") |
| | print(f" βββββββββββββββββββββββββββββββββ") |
| | print(f" TOTAL: {TOTAL_PARAMS:>13,}") |
| | print(f" β {TOTAL_PARAMS / 1e6:.1f}M parameters") |
| | print("=" * 50) |
| |
|
| | print("\nGenerating full network diagram...") |
| | draw_neural_network("viz/neural_network_full.png") |
| |
|
| | print("Generating single-layer detail...") |
| | draw_single_layer_detail("viz/neural_network_layer.png") |
| |
|
| | print("\nDone!") |
| |
|