# LLM Token & Attention Explorer with Streamlit
# Features: Tokenization, OpenAI Embeddings, Positional Encoding, Final Tensor,
#           Multi-Head Attention Simulation

import streamlit as st
import numpy as np
import tiktoken
import os
from openai import OpenAI
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA  # NOTE(review): unused — kept to preserve the file's import surface

st.set_page_config(page_title="LLM Token Explorer", layout="centered")
st.title("๐Ÿง  LLM Attention Explorer: Tokens, Embeddings, Positional Encoding, and Multi-Head Visualization")

# ---------------------------------------------------------------------------
# Introductory explanations
# ---------------------------------------------------------------------------
with st.expander("โ„น๏ธ About This App", expanded=True):
    st.markdown("""
This interactive app lets you explore how Large Language Models (LLMs) like GPT-3/4 work internally.
You'll learn about tokenization, embeddings, positional encoding, and multi-head self-attention
through real-time visualizations and simulations.
""")

with st.expander("๐Ÿงพ What is a Token?"):
    st.markdown("""
A token is a basic unit of text. It could be as small as a character or as large as a word depending on the tokenizer.
GPT models use subword tokenization (like Byte-Pair Encoding), meaning common patterns get their own token.

For example:
- "apple" โ†’ might be 1 token
- "unhappiness" โ†’ might be split into ["un", "happiness"]
""")

with st.expander("๐Ÿ“Œ What Are Embeddings?"):
    st.markdown("""
Embeddings are high-dimensional vectors that represent the meaning of each token.
Similar tokens (like 'cat' and 'dog') have embeddings that are close in space.
They're used by the model to perform mathematical operations on language.
""")

with st.expander("๐Ÿ“ Why Positional Encoding?"):
    st.markdown("""
Since transformers process all tokens in parallel and not sequentially, they need to know token positions.
Positional encodings are added to token embeddings to give each token a unique place in the sequence.
""")

with st.expander("๐Ÿง  What is Self-Attention?"):
    st.markdown("""
Self-attention allows the model to weigh the importance of each token in a sentence when encoding a specific token.
For example, in "The cat sat because it was tired", attention helps "it" focus more on "cat" than other words.
""")

with st.expander("๐Ÿ” Understanding Multi-Head Attention"):
    st.markdown("""
Each attention head learns different aspects of language. For example:
- One head might learn grammar structure.
- Another might learn long-distance relationships.

Heads run in parallel and their outputs are concatenated to form a rich representation of each token.
""")

# OpenAI client: key comes from the environment. A missing key is surfaced
# both here and via explicit error handling around the embeddings call below.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
st.text(f"OpenAI key found: {'Yes' if os.getenv('OPENAI_API_KEY') else 'No'}")

st.header("โœ๏ธ Input Text")
input_text = st.text_area("Enter your text:", height=150)
tokenizer_name = st.selectbox("Choose tokenizer:", ["cl100k_base", "p50k_base", "r50k_base", "gpt2"])


def get_positional_encoding(seq_len: int, dim: int) -> np.ndarray:
    """Return the sinusoidal positional-encoding matrix, shape (seq_len, dim).

    Even columns hold sin(pos * 10000^(-i/dim)) and odd columns the matching
    cos, as in "Attention Is All You Need". Fully vectorized — the original
    Python double loop was O(seq_len * dim) interpreted iterations, which is
    slow for a 1536-dim embedding.
    """
    positions = np.arange(seq_len)[:, np.newaxis]         # (seq_len, 1)
    even_idx = np.arange(0, dim, 2)                       # i = 0, 2, 4, ...
    div_term = np.exp(even_idx * -np.log(10000.0) / dim)  # (ceil(dim/2),)
    pe = np.zeros((seq_len, dim))
    pe[:, 0::2] = np.sin(positions * div_term)
    # Odd columns reuse the div_term of the preceding even index; the slice
    # handles odd `dim` (one fewer cos column than sin columns).
    pe[:, 1::2] = np.cos(positions * div_term[: dim // 2])
    return pe


if input_text:
    # ------------------------------------------------------------------
    # Tokenization
    # ------------------------------------------------------------------
    st.subheader("๐Ÿ”ค Tokenization")
    enc = tiktoken.get_encoding(tokenizer_name)
    tokens = enc.encode(input_text)
    token_strings = [enc.decode([t]) for t in tokens]

    with st.expander("๐Ÿงพ Token IDs", expanded=True):
        st.write(tokens)
    with st.expander("๐Ÿ“– Decoded Tokens", expanded=True):
        st.write(token_strings)
    st.info(f"Token count: {len(tokens)}")

    fig, ax = plt.subplots()
    ax.bar(range(len(tokens)), tokens, tick_label=token_strings)
    ax.set_xlabel("Token")
    ax.set_ylabel("Token ID")
    ax.set_title("Token IDs for Input Text")
    # Rotate on the owned Axes rather than via global pyplot state.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    st.pyplot(fig)
    plt.close(fig)  # free the figure; Streamlit already rendered a copy

    # ------------------------------------------------------------------
    # Embeddings — one batched API request instead of one call per token.
    # ------------------------------------------------------------------
    st.subheader("๐Ÿ”— OpenAI Token Embeddings")
    try:
        response = client.embeddings.create(
            input=token_strings, model="text-embedding-ada-002"
        )
    except Exception as exc:
        # Show auth/network failures in the UI instead of crashing mid-render.
        st.error(f"Embedding request failed: {exc}")
        st.stop()
    # The API returns `data` in the same order as the input list.
    embeddings = [item.embedding for item in response.data]

    for tok, embedding in zip(token_strings, embeddings):
        with st.expander(f"๐Ÿ”ธ '{tok}' Embedding", expanded=True):
            st.write(embedding)
            fig, ax = plt.subplots(figsize=(8, 1))
            sns.heatmap(np.array(embedding).reshape(1, -1), cmap="viridis", cbar=True, ax=ax)
            ax.set_title("Embedding Heatmap")
            ax.axis('off')
            st.pyplot(fig)
            plt.close(fig)  # avoid accumulating one open figure per token per rerun
    st.success("Generated embeddings for all tokens.")

    # ------------------------------------------------------------------
    # Positional encoding and the combined input tensor
    # ------------------------------------------------------------------
    st.subheader("๐Ÿ“ Positional Encoding")
    dim = len(embeddings[0])
    PE = get_positional_encoding(len(tokens), dim)
    with st.expander("๐Ÿ“ Positional Encoding Matrix", expanded=True):
        st.write(PE)

    st.subheader("๐Ÿงฎ Final Input Tensor (Embedding + PE)")
    embedded = np.array(embeddings)
    combined = embedded + PE
    with st.expander("๐Ÿงพ Final Tensor", expanded=True):
        st.write(combined)

    # ------------------------------------------------------------------
    # Toy multi-head self-attention. NOTE: it runs on RANDOM 32-dim vectors
    # and random projection weights, NOT on `combined` above — it illustrates
    # the mechanics of attention, not the real model's attention pattern.
    # ------------------------------------------------------------------
    st.subheader("๐Ÿง  Simulated Multi-Head Self-Attention")
    if st.button("Simulate Attention"):
        embed_dim = 32
        num_heads = 4
        head_dim = embed_dim // num_heads
        x = np.random.randn(len(tokens), embed_dim)
        W_q, W_k, W_v = [np.random.randn(embed_dim, embed_dim) for _ in range(3)]
        Q = x @ W_q
        K = x @ W_k
        V = x @ W_v

        def split_heads(t):
            # (tokens, embed_dim) -> (heads, tokens, head_dim)
            return t.reshape(len(tokens), num_heads, head_dim).transpose(1, 0, 2)

        Qh, Kh, Vh = split_heads(Q), split_heads(K), split_heads(V)

        def attention(q, k, v):
            # Scaled dot-product attention with a numerically stable softmax
            # (max-subtraction before exp).
            scores = q @ k.T / np.sqrt(k.shape[-1])
            weights = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
            weights /= np.sum(weights, axis=-1, keepdims=True)
            return weights @ v, weights

        outputs = []
        for i in range(num_heads):
            out, weights = attention(Qh[i], Kh[i], Vh[i])
            with st.expander(f"Head {i+1}"):
                st.write("Q:", Qh[i])
                st.write("K:", Kh[i])
                st.write("V:", Vh[i])
                st.write("Attention Weights:", weights)
                fig, ax = plt.subplots()
                sns.heatmap(weights, cmap="Blues", ax=ax)
                ax.set_title("Attention Weights Heatmap")
                st.pyplot(fig)
                plt.close(fig)
            outputs.append(out)

        # Concatenate the per-head outputs back to (tokens, embed_dim).
        final = np.concatenate(outputs, axis=-1)
        with st.expander("๐Ÿงฉ Concatenated Output"):
            st.write(final)

    with st.expander("๐Ÿ“Š Transformer and GPT Model Component Comparison (Table)", expanded=True):
        st.markdown("""
| Parameter | Original Transformer (2017) | GPT-2 (2019) | GPT-3 (2020) | GPT-4 (2023, est.) |
|----------------------------------|------------------------------|-------------------------|--------------------------|----------------------------|
| **Max Context Length (tokens)** | 512 | 1024 | 2048 | 8192 / 32,768 |
| **Vocab Size** | ~37,000 (BPE) | 50,257 | 50,257 | ~100,000 (multimodal-aware) |
| **Embedding Dimension (D)** | 512 | 768 โ€“ 1600 | 12,288 | 12,288+ |
| **Layers / Transformer Blocks** | 6 (base), 12 (large) | 12 โ€“ 48 (XL) | 96 | ~120 โ€“ 160 (est.) |
| **Self-Attention Heads** | 8 | 12 โ€“ 25 | 96 | 120 โ€“ 128+ (est.) |
| **Dim per Attention Head** | 64 | 64 | 128 | ~128 |
| **Batch Size (training)** | ~25k tokens | ~512 โ€“ 2048 tokens | ~3.2M tokens | Multi-million tokens (est.) |
| **Tensor Shape** | [Batch, Tokens, Dim] | Same | Same | Same |
| **Parameters (Total)** | ~65M | 124M โ€“ 1.5B | 175B | ~500B โ€“ 1T+ (speculative) |

**Explanations:**
- **Context Length**: Max number of tokens the model can see at once.
- **Embedding Dim**: Size of token vectors.
- **Layers**: Depth of the network (attention + FFN).
- **Heads**: Parallel attention mechanisms.
- **Dim per Head**: Each head gets a slice of the full embedding.
- **Tensor Shape**: Internal model shape: [Batch, Tokens, Embedding].
""")