Phoenix21 commited on
Commit
3a924ef
·
verified ·
1 Parent(s): 1c6672e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -53
app.py CHANGED
@@ -1,69 +1,68 @@
1
  import torch
2
  import gradio as gr
 
 
 
3
  import networkx as nx
4
  import matplotlib.pyplot as plt
5
- from transformers import GPT2Model, GPT2Tokenizer
6
- from sklearn.cluster import KMeans
7
 
8
# 1. Load a real small model
# Pick GPU when available; all tensors/models below are moved to this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "gpt2" # 124M parameters
# Loaded once at module import so every Gradio request reuses the same weights.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2Model.from_pretrained(model_name).to(device)
13
 
14
def get_hidden_state(sequence_str):
    """Return the final-layer activation vector for the last token of *sequence_str*.

    Encodes the string with the module-level tokenizer, runs the model
    without gradient tracking, and extracts the last hidden layer at the
    final token position as a NumPy array.
    """
    encoded = tokenizer(sequence_str, return_tensors="pt").to(device)
    with torch.no_grad():
        result = model(**encoded, output_hidden_states=True)
    # hidden_states[-1] is the last layer; [0, -1, :] selects batch 0, last token.
    final_layer = result.hidden_states[-1]
    return final_layer[0, -1, :].cpu().numpy()
20
-
21
def analyze_dfa(input_text):
    """
    Simulates a 'State Probe'.

    Input: 'Right, Up, Left'
    Logic: Generates a graph showing how the model's internal representation
    changes with each move.

    Returns:
        (image_path, message): path to the rendered DFA plot and a summary
        string. image_path is None when no moves were supplied.
    """
    # Drop empty entries so trailing commas / blank input can't reach KMeans.
    moves = [m.strip() for m in input_text.split(",") if m.strip()]
    if not moves:
        # Guard: KMeans would raise with n_clusters=0 on empty input.
        return None, "Please enter at least one move (comma-separated)."

    history = ""
    states_vectors = []

    # Track the "path" through the model's internal space
    for move in moves:
        history += f" Move {move}."
        states_vectors.append(get_hidden_state(history))

    # Clustering: Vafa's Compression metric
    # We cluster activations to see which moves the model thinks are 'equivalent'
    num_clusters = min(len(moves), 4)
    kmeans = KMeans(n_clusters=num_clusters, n_init=10).fit(states_vectors)
    labels = kmeans.labels_

    # Build the DFA Graph: consecutive cluster ids define transitions,
    # labelled with the move that caused each transition.
    G = nx.DiGraph()
    for i in range(len(moves) - 1):
        u, v = f"S{labels[i]}", f"S{labels[i + 1]}"
        G.add_edge(u, v, label=moves[i + 1])

    # Draw the DFA
    plt.figure(figsize=(6, 4))
    pos = nx.spring_layout(G)
    nx.draw(G, pos, with_labels=True, node_color='lightblue', node_size=2000)
    edge_labels = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)

    plt.savefig("dfa_plot.png")
    plt.close()  # release the figure; repeated Gradio calls otherwise leak memory

    return "dfa_plot.png", f"Found {num_clusters} distinct internal states."
59
 
60
# 3. Gradio Interface
# Build the input/output widgets first, then wire them into the Interface.
move_box = gr.Textbox(
    placeholder="Enter moves separated by commas, e.g.: Right, Up, Left, Down"
)
dfa_image = gr.Image(label="Extracted Model DFA")
analysis_text = gr.Text(label="Analysis")

demo = gr.Interface(
    fn=analyze_dfa,
    inputs=move_box,
    outputs=[dfa_image, analysis_text],
    title="World Model DFA Extractor",
    description=(
        "This tool probes GPT-2's internal activations to see if it treats "
        "different move sequences as the same 'State'."
    ),
)

demo.launch()
 
1
  import torch
2
  import gradio as gr
3
+ from transformers import AutoModel, AutoTokenizer
4
+ from datasets import load_dataset
5
+ from sklearn.cluster import KMeans
6
  import networkx as nx
7
  import matplotlib.pyplot as plt
 
 
8
 
9
# 1. Configuration for Models & Datasets
# Model checkpoints selectable in the UI; all are small causal LMs loadable
# via AutoModel/AutoTokenizer.
MODELS = ["gpt2", "distilgpt2", "qwen/Qwen2.5-0.5B", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"]
# NOTE(review): these ids are passed straight to datasets.load_dataset().
# "wikitext" requires an explicit config name (e.g. "wikitext-2-raw-v1") and
# "tinystories" may not resolve under that bare id on the Hub — verify both.
DATASETS = ["wikitext", "tinystories", "ag_news"]
 
 
12
 
13
def analyze_world_model(model_name, dataset_name, num_samples=20):
    """
    Probe a language model's hidden states over streamed dataset samples and
    reconstruct a DFA-style transition graph from clustered activations.

    Args:
        model_name: Hugging Face model id loadable via AutoModel/AutoTokenizer.
        dataset_name: Hugging Face dataset id whose examples carry a 'text'
            field (assumed for all entries in DATASETS — TODO confirm).
        num_samples: number of streamed examples to probe (default 20).

    Returns:
        (image_path, message): path to the rendered DFA plot and a summary
        string. image_path is None when no usable text samples were found.
    """
    # Load Model & Tokenizer
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)

    # Load Dataset. Some Hub datasets require an explicit config name;
    # bare load_dataset("wikitext", ...) raises without one.
    config_name = {"wikitext": "wikitext-2-raw-v1"}.get(dataset_name)
    ds = load_dataset(dataset_name, config_name, split='train', streaming=True).take(num_samples)

    all_hidden_states = []

    # Step A: The Probe (Keplerian Observation)
    for example in ds:
        text = example.get('text', '')[:100]  # Use a snippet
        if not text.strip():
            # wikitext streams many blank lines; a zero-length token sequence
            # would break the [0, -1, :] indexing below.
            continue
        inputs = tokenizer(text, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
        # Take the last hidden state of the sequence
        state = outputs.hidden_states[-1][0, -1, :].cpu().numpy()
        all_hidden_states.append(state)

    if not all_hidden_states:
        # Guard: KMeans would crash on an empty sample set.
        return None, f"No usable text samples found in '{dataset_name}'."

    # Step B: Myhill-Nerode Clustering (Newtonian Recovery)
    # We cluster to find 'Equivalence Classes' (Internal States)
    n_clusters = min(len(all_hidden_states), 5)
    kmeans = KMeans(n_clusters=n_clusters, n_init=10).fit(all_hidden_states)
    state_assignments = kmeans.labels_

    # Step C: DFA Reconstruction — consecutive samples define transitions.
    G = nx.DiGraph()
    for i in range(len(state_assignments) - 1):
        u, v = f"S{state_assignments[i]}", f"S{state_assignments[i+1]}"
        G.add_edge(u, v, label=f"Next_{i}")

    # Draw the DFA
    plt.figure(figsize=(8, 6))
    pos = nx.spring_layout(G)
    nx.draw(G, pos, with_labels=True, node_color='orange', node_size=3000, font_weight='bold')
    plt.savefig("dfa_output.png")
    plt.close()  # release the figure; repeated Gradio calls otherwise leak memory

    return "dfa_output.png", f"Model '{model_name}' reduced this dataset into {n_clusters} distinct internal states."
 
55
 
56
# 3. Gradio UI
# Blocks layout: two dropdowns side by side, a trigger button, then the
# image + text outputs that analyze_world_model returns as a pair.
with gr.Blocks() as demo:
    gr.Markdown("# The Universal Newtonian Probe")
    with gr.Row():
        m_drop = gr.Dropdown(choices=MODELS, label="Select Model", value="gpt2")
        d_drop = gr.Dropdown(choices=DATASETS, label="Select Dataset", value="wikitext")
    btn = gr.Button("Analyze Coherence")
    out_img = gr.Image(label="Extracted DFA")
    out_txt = gr.Textbox(label="Analysis Result")

    # Wire the button: (model, dataset) in -> (image path, summary text) out.
    btn.click(analyze_world_model, inputs=[m_drop, d_drop], outputs=[out_img, out_txt])

demo.launch()