Spaces:

OpenTransformer
/

ar-vs-sat

Sleeping

App Files Community

OpenTransformer committed on Jan 20

Commit

19e9060

verified ·

1 Parent(s): 08d6098

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +38 -17

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import gradio as gr
 import torch
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
 # Load model once
 print("Loading GPT-2...")
@@ -59,9 +60,34 @@ def forced_sat_generate(prompt, n_tokens=50, block_size=2):
 def compare(prompt, n_tokens):
     n_tokens = int(n_tokens)
     ar_output = ar_generate(prompt, n_tokens)
     sat_output = forced_sat_generate(prompt, n_tokens)
-    return ar_output, sat_output
 # Gradio interface
 with gr.Blocks(title="AR vs Forced SAT") as demo:
@@ -70,12 +96,6 @@ with gr.Blocks(title="AR vs Forced SAT") as demo:
     **Can AR models be forced to output 2 tokens at once?**
-    - **AR (Autoregressive):** Standard 1-token-at-a-time generation
-    - **Forced SAT:** Outputs 2 tokens per step using stale context for token 2
-    Forced SAT runs ~2x faster but produces degraded output because AR hidden states
-    don't encode multi-token futures. Joint AR+SAT training is required for quality SAT inference.
     Model: GPT-2 (124M params)
     """)
@@ -85,11 +105,17 @@ with gr.Blocks(title="AR vs Forced SAT") as demo:
     btn = gr.Button("Generate", variant="primary")
     with gr.Row():
-        ar_output = gr.Textbox(label="AR Output (baseline)", lines=5)
-        sat_output = gr.Textbox(label="Forced SAT v1 (2x speed, degraded)", lines=5)
-    btn.click(compare, inputs=[prompt, n_tokens], outputs=[ar_output, sat_output])
     gr.Examples(
         examples=[
@@ -97,20 +123,15 @@ with gr.Blocks(title="AR vs Forced SAT") as demo:
             ["In the beginning", 40],
             ["Once upon a time", 40],
             ["Machine learning is", 40],
-            ["The president announced that", 40],
         ],
         inputs=[prompt, n_tokens],
     )
     gr.Markdown("""
     ---
-    **Why Forced SAT fails:** AR hidden states at position N only encode "next token N+1".
-    There's no representation for token N+2. Forcing 2-token output uses stale context,
-    creating alternating good/bad tokens.
-    **Solution:** Train AR+SAT jointly from scratch so representations encode multiple future tokens.
-    See: [AGILLM-3](https://huggingface.co/OpenTransformer/AGILLM-3-large) | [Experiment Code](https://huggingface.co/OpenTransformer/sat-retrofit-experiment)
     *OpenTransformers Ltd - Scott Bisset*
     """)

 import gradio as gr
 import torch
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
+import time
 # Load model once
 print("Loading GPT-2...")
 def compare(prompt, n_tokens):
     """Run both generators on one prompt and report their throughput.

     Args:
         prompt: Prompt text, passed unchanged to ``ar_generate`` and
             ``forced_sat_generate``.
         n_tokens: Requested number of tokens; coerced with ``int()``
             (presumably because the UI input can deliver a float --
             verify against the widget definition).

     Returns:
         A 5-tuple ``(ar_output, sat_output, ar_label, sat_label,
         speedup_text)``, in the same order as the ``outputs`` list
         wired to ``btn.click``.
     """
     n_tokens = int(n_tokens)
     def _sync():
         # Wait for queued GPU work so the wall-clock interval covers the
         # whole generation rather than only the time to enqueue it.
         if device == "cuda":
             torch.cuda.synchronize()
     def _timed(generate):
         # Return (generated_text, elapsed_seconds) for one generator.
         _sync()
         start = time.perf_counter()
         text = generate(prompt, n_tokens)
         _sync()
         return text, time.perf_counter() - start
     def _rate(elapsed):
         # Same zero-guard the speedup ratio uses, so a zero-length
         # interval reports 0.0 tok/s instead of raising ZeroDivisionError.
         # NOTE(review): the rate is based on the *requested* n_tokens;
         # confirm both generators always emit exactly that many tokens.
         return n_tokens / elapsed if elapsed > 0 else 0.0
     ar_output, ar_time = _timed(ar_generate)
     sat_output, sat_time = _timed(forced_sat_generate)
     speedup = ar_time / sat_time if sat_time > 0 else 0
     # Keep the "###" prefix so the refreshed labels render as headings,
     # matching the placeholder text the Markdown components start with.
     ar_label = f"### AR Output - {_rate(ar_time):.1f} tok/s"
     sat_label = f"### Forced SAT - {_rate(sat_time):.1f} tok/s"
     speedup_text = f"## Speedup: {speedup:.2f}x"
     return ar_output, sat_output, ar_label, sat_label, speedup_text
 # Gradio interface
 with gr.Blocks(title="AR vs Forced SAT") as demo:
     **Can AR models be forced to output 2 tokens at once?**
     Model: GPT-2 (124M params)
     """)
     btn = gr.Button("Generate", variant="primary")
+    speedup_display = gr.Markdown("## Speedup: ?x")
     with gr.Row():
+        ar_label = gr.Markdown("### AR Output - ? tok/s")
+        sat_label = gr.Markdown("### Forced SAT - ? tok/s")
+    with gr.Row():
+        ar_output = gr.Textbox(label="", lines=5, show_label=False)
+        sat_output = gr.Textbox(label="", lines=5, show_label=False)
+    btn.click(compare, inputs=[prompt, n_tokens], outputs=[ar_output, sat_output, ar_label, sat_label, speedup_display])
     gr.Examples(
         examples=[
             ["In the beginning", 40],
             ["Once upon a time", 40],
             ["Machine learning is", 40],
         ],
         inputs=[prompt, n_tokens],
     )
     gr.Markdown("""
     ---
+    **Why Forced SAT fails quality:** AR hidden states only encode "next token". Forcing 2-token output uses stale context.
+    **Solution:** Joint AR+SAT training from scratch. See [AGILLM-3](https://huggingface.co/OpenTransformer/AGILLM-3-large)
     *OpenTransformers Ltd - Scott Bisset*
     """)