""" GPT-300M Neural Network Visualizer ==================================== Generates detailed architectural diagrams of the GPT-300M model using matplotlib, showing: - Full model architecture flow - Detailed transformer block internals - Attention head visualization - Parameter distribution charts Usage: python visualize_nn.py python visualize_nn.py --output architecture.png """ import argparse import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt import matplotlib.patches as patches from matplotlib.patches import FancyBboxPatch, FancyArrowPatch import numpy as np from config import GPT300MConfig, gpt_300m # ═══════════════════════════════════════════════════════════════════════ # COLOR SCHEME # ═══════════════════════════════════════════════════════════════════════ COLORS = { "bg": "#0D1117", "text": "#E6EDF3", "text_dim": "#8B949E", "embed": "#58A6FF", # Blue "attn": "#F78166", # Orange "ffn": "#7EE787", # Green "norm": "#D2A8FF", # Purple "residual": "#FFA657", # Yellow-orange "output": "#FF7B72", # Red "arrow": "#484F58", "highlight": "#1F6FEB", "border": "#30363D", "card_bg": "#161B22", "accent1": "#79C0FF", "accent2": "#BB9AF7", } def draw_rounded_box(ax, x, y, w, h, color, label, fontsize=10, text_color=None, alpha=0.9, sublabel=None): """Draw a rounded rectangle with label.""" box = FancyBboxPatch( (x - w/2, y - h/2), w, h, boxstyle="round,pad=0.1", facecolor=color, edgecolor="white", linewidth=0.5, alpha=alpha, zorder=3, ) ax.add_patch(box) ax.text( x, y + (0.15 if sublabel else 0), label, ha="center", va="center", fontsize=fontsize, fontweight="bold", color=text_color or COLORS["text"], zorder=4, ) if sublabel: ax.text( x, y - 0.25, sublabel, ha="center", va="center", fontsize=fontsize - 2, color=COLORS["text_dim"], zorder=4, ) def draw_arrow(ax, x1, y1, x2, y2, color=None): """Draw an arrow between two points.""" ax.annotate( "", xy=(x2, y2), xytext=(x1, y1), arrowprops=dict( arrowstyle="->", color=color or COLORS["arrow"], lw=1.5, connectionstyle="arc3,rad=0", ), zorder=2, ) def draw_residual_connection(ax, x_start, y_start, x_end, y_end, offset=1.8): """Draw a residual/skip connection arc.""" ax.annotate( "", xy=(x_end, y_end), xytext=(x_start, y_start), arrowprops=dict( arrowstyle="->", color=COLORS["residual"], lw=1.2, linestyle="--", connectionstyle=f"arc3,rad=0.3", ), zorder=1, ) # ═══════════════════════════════════════════════════════════════════════ # FULL ARCHITECTURE DIAGRAM # ═══════════════════════════════════════════════════════════════════════ def draw_full_architecture(config: GPT300MConfig, save_path: str = None): """Draw the complete GPT-300M architecture.""" fig, ax = plt.subplots(1, 1, figsize=(14, 24), facecolor=COLORS["bg"]) ax.set_facecolor(COLORS["bg"]) ax.set_xlim(-4, 4) ax.set_ylim(-1, 22) ax.axis("off") # Title ax.text(0, 21.5, "GPT-300M Architecture", ha="center", va="center", fontsize=22, fontweight="bold", color=COLORS["text"], fontfamily="monospace") ax.text(0, 21.0, f"{config.total_params_estimate:,} parameters • " f"{config.n_layers} layers • " f"{config.n_heads} heads • " f"d={config.d_model}", ha="center", va="center", fontsize=10, color=COLORS["text_dim"], fontfamily="monospace") y = 19.5 # Starting y position # ── Input ────────────────────────────────────────────────────── draw_rounded_box(ax, 0, y, 3.5, 0.7, COLORS["card_bg"], "Input Token IDs", sublabel=f"[batch, seq_len]", fontsize=11) y -= 1.1 draw_arrow(ax, 0, y + 0.8, 0, y + 0.4) # ── Token Embedding ──────────────────────────────────────────── draw_rounded_box(ax, 0, y, 3.5, 0.7, COLORS["embed"], "Token Embedding", text_color="#000", sublabel=f"{config.vocab_size:,} × {config.d_model}") y -= 1.1 draw_arrow(ax, 0, y + 0.8, 0, y + 0.4) # ── RoPE ─────────────────────────────────────────────────────── draw_rounded_box(ax, 0, y, 3.5, 0.6, COLORS["accent2"], "Rotary Position Embeddings (RoPE)", text_color="#000", fontsize=9, sublabel=f"θ = {config.rope_theta:.0f}") y -= 1.0 draw_arrow(ax, 0, y + 0.7, 0, y + 0.4) # ── Dropout ──────────────────────────────────────────────────── draw_rounded_box(ax, 0, y, 2.5, 0.5, COLORS["border"], f"Dropout (p={config.dropout})", fontsize=9) y -= 1.0 draw_arrow(ax, 0, y + 0.7, 0, y + 0.35) # ── Transformer Blocks ───────────────────────────────────────── block_height = 3.2 # Draw detailed first block block_y_start = y block_y_end = y - block_height # Block container block_box = FancyBboxPatch( (-3.3, block_y_end - 0.1), 6.6, block_height + 0.2, boxstyle="round,pad=0.15", facecolor=COLORS["card_bg"], edgecolor=COLORS["highlight"], linewidth=1.5, alpha=0.8, zorder=1, ) ax.add_patch(block_box) ax.text(-3.0, block_y_start + 0.05, f"Transformer Block × {config.n_layers}", fontsize=10, fontweight="bold", color=COLORS["highlight"], fontfamily="monospace", zorder=5) # Inside the block by = block_y_start - 0.4 # RMSNorm 1 draw_rounded_box(ax, 0, by, 2.8, 0.45, COLORS["norm"], "RMSNorm", text_color="#000", fontsize=9) by -= 0.7 draw_arrow(ax, 0, by + 0.5, 0, by + 0.25) # Multi-Head Attention draw_rounded_box(ax, 0, by, 2.8, 0.7, COLORS["attn"], "Multi-Head Attention", text_color="#000", fontsize=10, sublabel=f"{config.n_heads} heads × {config.head_dim}d") # Residual connection draw_residual_connection(ax, -1.6, block_y_start - 0.2, -1.6, by) ax.text(-2.5, by + 0.3, "⊕ residual", fontsize=7, color=COLORS["residual"], ha="center") by -= 0.8 draw_arrow(ax, 0, by + 0.5, 0, by + 0.25) # RMSNorm 2 draw_rounded_box(ax, 0, by, 2.8, 0.45, COLORS["norm"], "RMSNorm", text_color="#000", fontsize=9) by -= 0.7 draw_arrow(ax, 0, by + 0.5, 0, by + 0.25) # Feed-Forward Network draw_rounded_box(ax, 0, by, 2.8, 0.7, COLORS["ffn"], "Feed-Forward Network", text_color="#000", fontsize=10, sublabel=f"{config.d_model} → {config.d_ff} → {config.d_model}") # Residual connection draw_residual_connection(ax, 1.6, by + 1.5, 1.6, by) ax.text(2.5, by + 0.7, "⊕ residual", fontsize=7, color=COLORS["residual"], ha="center") y = block_y_end - 0.4 # ── Repeated blocks indicator ────────────────────────────────── draw_arrow(ax, 0, y + 0.2, 0, y - 0.1) ax.text(0, y - 0.3, f"× {config.n_layers} layers", ha="center", fontsize=11, fontweight="bold", color=COLORS["text_dim"], fontfamily="monospace", bbox=dict(boxstyle="round,pad=0.3", facecolor=COLORS["card_bg"], edgecolor=COLORS["border"])) y -= 0.9 draw_arrow(ax, 0, y + 0.3, 0, y + 0.05) # ── Final RMSNorm ────────────────────────────────────────────── draw_rounded_box(ax, 0, y - 0.2, 3.5, 0.5, COLORS["norm"], "Final RMSNorm", text_color="#000", fontsize=10) y -= 1.0 draw_arrow(ax, 0, y + 0.5, 0, y + 0.2) # ── LM Head ──────────────────────────────────────────────────── draw_rounded_box(ax, 0, y - 0.1, 3.5, 0.7, COLORS["output"], "Linear (LM Head)", text_color="#000", fontsize=11, sublabel=f"{config.d_model} → {config.vocab_size:,} (weight-tied)") y -= 1.1 draw_arrow(ax, 0, y + 0.7, 0, y + 0.35) # ── Softmax / Output ─────────────────────────────────────────── draw_rounded_box(ax, 0, y, 3.5, 0.6, COLORS["card_bg"], "Softmax → Next Token Probabilities", fontsize=10, sublabel=f"[batch, seq_len, {config.vocab_size:,}]") plt.tight_layout() if save_path: fig.savefig(save_path, dpi=200, bbox_inches="tight", facecolor=COLORS["bg"], edgecolor="none") print(f"Saved architecture diagram: {save_path}") return fig # ═══════════════════════════════════════════════════════════════════════ # PARAMETER DISTRIBUTION CHART # ═══════════════════════════════════════════════════════════════════════ def draw_parameter_chart(config: GPT300MConfig, save_path: str = None): """Draw a parameter distribution breakdown.""" fig, axes = plt.subplots(1, 2, figsize=(16, 7), facecolor=COLORS["bg"]) # Calculate parameter counts per component emb_params = config.vocab_size * config.d_model attn_params = 4 * config.d_model * config.d_model * config.n_layers ffn_params = 2 * config.d_model * config.d_ff * config.n_layers norm_params = 2 * config.d_model * config.n_layers + config.d_model total = emb_params + attn_params + ffn_params + norm_params # ── Pie Chart ────────────────────────────────────────────────── ax = axes[0] ax.set_facecolor(COLORS["bg"]) labels = ["Token\nEmbedding", "Attention\nLayers", "Feed-Forward\nLayers", "LayerNorm"] sizes = [emb_params, attn_params, ffn_params, norm_params] colors = [COLORS["embed"], COLORS["attn"], COLORS["ffn"], COLORS["norm"]] wedges, texts, autotexts = ax.pie( sizes, labels=None, autopct=lambda p: f"{p:.1f}%", colors=colors, startangle=90, pctdistance=0.7, wedgeprops=dict(width=0.5, edgecolor=COLORS["bg"], linewidth=2), textprops=dict(color=COLORS["text"], fontsize=10), ) for at in autotexts: at.set_fontweight("bold") at.set_color("#000") # Legend legend_labels = [ f"{l}\n({s/1e6:.1f}M)" for l, s in zip( ["Token Embedding", "Attention", "Feed-Forward", "LayerNorm"], sizes ) ] ax.legend( wedges, legend_labels, loc="center left", bbox_to_anchor=(1.05, 0.5), fontsize=9, frameon=False, labelcolor=COLORS["text"], ) ax.set_title("Parameter Distribution", fontsize=14, fontweight="bold", color=COLORS["text"], pad=15) # ── Per-Layer Breakdown Bar Chart ────────────────────────────── ax = axes[1] ax.set_facecolor(COLORS["bg"]) layer_attn = 4 * config.d_model * config.d_model layer_ffn = 2 * config.d_model * config.d_ff layer_norm = 2 * config.d_model layers = range(1, config.n_layers + 1) bar_width = 0.8 ax.bar(layers, [layer_attn / 1e6] * config.n_layers, bar_width, label="Attention", color=COLORS["attn"], alpha=0.9) ax.bar(layers, [layer_ffn / 1e6] * config.n_layers, bar_width, bottom=[layer_attn / 1e6] * config.n_layers, label="Feed-Forward", color=COLORS["ffn"], alpha=0.9) ax.bar(layers, [layer_norm / 1e6] * config.n_layers, bar_width, bottom=[(layer_attn + layer_ffn) / 1e6] * config.n_layers, label="Norm", color=COLORS["norm"], alpha=0.9) ax.set_xlabel("Layer", fontsize=11, color=COLORS["text"]) ax.set_ylabel("Parameters (M)", fontsize=11, color=COLORS["text"]) ax.set_title("Parameters Per Layer", fontsize=14, fontweight="bold", color=COLORS["text"], pad=15) ax.legend(fontsize=9, frameon=False, labelcolor=COLORS["text"]) ax.tick_params(colors=COLORS["text_dim"]) ax.spines["bottom"].set_color(COLORS["border"]) ax.spines["left"].set_color(COLORS["border"]) ax.spines["top"].set_visible(False) ax.spines["right"].set_visible(False) # Overall title fig.suptitle( f"GPT-300M • {total:,} Total Parameters", fontsize=16, fontweight="bold", color=COLORS["text"], fontfamily="monospace", y=1.02, ) plt.tight_layout() if save_path: fig.savefig(save_path, dpi=200, bbox_inches="tight", facecolor=COLORS["bg"], edgecolor="none") print(f"Saved parameter chart: {save_path}") return fig # ═══════════════════════════════════════════════════════════════════════ # ATTENTION HEAD VISUALIZATION # ═══════════════════════════════════════════════════════════════════════ def draw_attention_heads(config: GPT300MConfig, save_path: str = None): """Visualize the multi-head attention mechanism.""" fig, ax = plt.subplots(1, 1, figsize=(14, 10), facecolor=COLORS["bg"]) ax.set_facecolor(COLORS["bg"]) ax.set_xlim(-1, 11) ax.set_ylim(-1, 8) ax.axis("off") ax.text(5, 7.5, "Multi-Head Self-Attention", ha="center", fontsize=18, fontweight="bold", color=COLORS["text"], fontfamily="monospace") ax.text(5, 7.0, f"{config.n_heads} heads × {config.head_dim}d per head = {config.d_model}d total", ha="center", fontsize=10, color=COLORS["text_dim"]) # Input draw_rounded_box(ax, 5, 6.2, 4, 0.5, COLORS["embed"], f"Input: [B, T, {config.d_model}]", text_color="#000", fontsize=9) # Q, K, V projections for i, (name, color) in enumerate(zip(["Q", "K", "V"], ["#FF6B6B", "#4ECDC4", "#45B7D1"])): x = 2 + i * 3 draw_arrow(ax, 5, 5.9, x, 5.4) draw_rounded_box(ax, x, 5.1, 1.8, 0.5, color, f"W_{name}", text_color="#000", fontsize=10, sublabel=f"{config.d_model}×{config.d_model}") # Heads head_y = 3.8 n_show = min(config.n_heads, 8) head_spacing = 9.0 / n_show for h in range(n_show): hx = 1 + h * head_spacing # Head box box = FancyBboxPatch( (hx - 0.4, head_y - 0.3), 0.8, 0.6, boxstyle="round,pad=0.05", facecolor=COLORS["attn"], edgecolor="white", linewidth=0.5, alpha=0.8, zorder=3, ) ax.add_patch(box) ax.text(hx, head_y, f"H{h+1}", ha="center", va="center", fontsize=8, fontweight="bold", color="#000", zorder=4) # Arrows from Q,K,V to heads for qi, qx in enumerate([2, 5, 8]): ax.annotate("", xy=(hx, head_y + 0.3), xytext=(qx, 4.8), arrowprops=dict(arrowstyle="-", color=COLORS["arrow"], lw=0.3, alpha=0.3), zorder=1) if config.n_heads > 8: ax.text(5, head_y - 0.6, f"... ({config.n_heads} heads total)", ha="center", fontsize=9, color=COLORS["text_dim"]) # Attention computation draw_rounded_box(ax, 5, 2.5, 6, 0.6, COLORS["card_bg"], "Scaled Dot-Product: softmax(QK^T / √d_k) × V", fontsize=10) for h in range(n_show): hx = 1 + h * head_spacing draw_arrow(ax, hx, head_y - 0.3, 5, 2.85) # Concatenate draw_arrow(ax, 5, 2.15, 5, 1.75) draw_rounded_box(ax, 5, 1.5, 4, 0.5, COLORS["accent1"], "Concat → W_O projection", text_color="#000", fontsize=10) # Output draw_arrow(ax, 5, 1.2, 5, 0.8) draw_rounded_box(ax, 5, 0.5, 4, 0.5, COLORS["ffn"], f"Output: [B, T, {config.d_model}]", text_color="#000", fontsize=9) plt.tight_layout() if save_path: fig.savefig(save_path, dpi=200, bbox_inches="tight", facecolor=COLORS["bg"], edgecolor="none") print(f"Saved attention diagram: {save_path}") return fig # ═══════════════════════════════════════════════════════════════════════ # MAIN # ═══════════════════════════════════════════════════════════════════════ if __name__ == "__main__": parser = argparse.ArgumentParser(description="Visualize GPT-300M Architecture") parser.add_argument("--output", type=str, default="./viz", help="Output directory for images") args = parser.parse_args() import os os.makedirs(args.output, exist_ok=True) config = gpt_300m() print(f"Generating visualizations for GPT-300M ({config.total_params_estimate:,} params)...") draw_full_architecture(config, os.path.join(args.output, "architecture.png")) draw_parameter_chart(config, os.path.join(args.output, "parameters.png")) draw_attention_heads(config, os.path.join(args.output, "attention.png")) print("Done! All visualizations saved.")