# Gradio Space demo for the paper
# "Answer Convergence as a Signal for Early Stopping in Reasoning".
import gradio as gr
import time
# Paper metadata displayed in the demo header.
PAPER_TITLE = "Answer Convergence as a Signal for Early Stopping in Reasoning"
AUTHORS = "Xin Liu, Lu Wang (University of Michigan)"
GITHUB_LINK = "https://github.com/launchnlp/reasoning_earlystop"

# Case data, sourced from the paper's Figure 1 and Table 1.
# Each entry maps a display name to a dict with:
#   question             - the problem posed to the model
#   full_reasoning       - the complete chain-of-thought trace
#   early_stop_reasoning - the trace truncated at answer convergence
#   savings              - summary line shown when the simulation finishes
examples = {
    "Case 1: Record Sales (GSM8K) - Early Convergence": {
        "question": "Marilyn's first record sold 10 times as many copies as Harald's. If they sold 88,000 copies combined, how many copies did Harald sell?",
        "full_reasoning": """<think> Okay, so I need to figure out how many copies Harald sold.
So 11H = 88,000 I need to solve for it by dividing both sides by 11
88,000 divided by 11 should be 8,000.
Wait, let me double-check that.
If I multiply 11 by 8,000, I should get 88,000. That checks out.
I don't think I made any mistakes here.
So Harald sold 8,000 copies.
</think> 8000""",
        "early_stop_reasoning": """<think> Okay, so I need to figure out how many copies Harald sold.
So 11H = 88,000 I need to solve for it by dividing both sides by 11
88,000 divided by 11 should be 8,000.
</think> 8000""",
        "savings": "Result: The model converged early. Later steps were redundant."
    },
    "Case 2: Lemon Tree (GSM8K) - Token Savings": {
        "question": "Carlos is planting a lemon tree. The tree will cost $90 to plant. Each year it will grow 7 lemons, which he can sell for $1.5 each. It costs $3 a year to water and feed the tree. How many years will it take before he starts earning money on the lemon tree?",
        "full_reasoning": """To determine how many years... Solving for n: n=$90/$7.5, n=12.
Wait, let me double-check my calculations to make sure I didn't make a mistake... That seems correct.
Checking the revenue vs cost again...
Everything looks fine.
</think> \\boxed{12}""",
        "early_stop_reasoning": """Okay, so Carlos is planting... Calculating that, 90 divided by 7.5 equals 12.
</think> \\boxed{12}""",
        "savings": "Efficiency Gain: ~23% Token Reduction (439 -> 338 tokens)."
    }
}
def simulate_generation(case_name, *, cases=None, step_size=5, delay=0.05):
    """Simulate streaming generation to compare full CoT vs. early stopping.

    Reveals both reasoning traces in lockstep with a typewriter effect,
    flipping the status message once the early-stop trace is exhausted.

    Args:
        case_name: Key identifying the case to replay.
        cases: Mapping of case name -> case dict; defaults to the
            module-level ``examples``.
        step_size: Number of characters revealed per update.
        delay: Seconds slept between updates (0 disables throttling).

    Yields:
        ``(question, full_text_so_far, early_stop_text_so_far, status)``
        tuples suitable for streaming into the four Gradio outputs.
    """
    if cases is None:
        cases = examples
    case = cases[case_name]
    question = case["question"]
    full_text = case["full_reasoning"]
    stop_text = case["early_stop_reasoning"]

    # Initial state before any text is revealed.
    yield question, "", "", "Initializing..."

    max_len = max(len(full_text), len(stop_text))
    for i in range(0, max_len, step_size):
        # Slicing past a string's end is safe in Python, so no bounds
        # checks are needed for the shorter trace.
        current_full = full_text[:i]
        current_stop = stop_text[:i]

        # Status priority: fully done > early-stop trace exhausted > running.
        if i >= len(full_text):
            status = "Done."
        elif i >= len(stop_text):
            status = "⚡ Early Stopping Triggered! (Saving Compute) ⚡"
        else:
            status = "Generating..."

        yield question, current_full, current_stop, status
        if delay:
            time.sleep(delay)

    # Final yield guarantees the complete texts and savings summary are shown.
    yield question, full_text, stop_text, f"Done! {case['savings']}"
# Build the Gradio Interface
# Build the Gradio interface.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Header section: title, authors, links, and a short description.
    gr.Markdown(f"# π {PAPER_TITLE}")
    gr.Markdown(f"**Authors:** {AUTHORS}")
    gr.Markdown(f"**Code & Resources:** [GitHub Repository]({GITHUB_LINK}) | [ArXiv Paper](https://arxiv.org/abs/2506.02536)")
    gr.Markdown("""
### π‘ Demo Description
This interactive demo illustrates the core concept of our **Early Stopping** strategy.
* **Left Panel:** Shows the model's full Chain-of-Thought (CoT) reasoning process.
* **Right Panel:** Shows the reasoning process truncated by our method.
**Key Insight:** Models often reach **Answer Convergence** (the correct answer) well before completing the full reasoning chain. Subsequent steps are often redundant self-verification, which can be safely skipped to reduce inference costs.
""")

    # Control section: case selector and run button side by side.
    with gr.Row():
        case_dropdown = gr.Dropdown(
            choices=list(examples.keys()),
            value="Case 1: Record Sales (GSM8K) - Early Convergence",
            label="Select a Test Case"
        )
        run_btn = gr.Button("βΆοΈ Run Simulation", variant="primary")

    # FIX: the generator's first output (the question) was previously routed
    # into an invisible Textbox created inline in the click() outputs list,
    # so users never saw which question was being answered.  Show it.
    question_box = gr.Textbox(label="Question", interactive=False)
    status_bar = gr.Textbox(label="Status", value="Ready to run", interactive=False)

    # Display section: full CoT on the left, early-stopped CoT on the right.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### π’ Original (Full CoT)")
            full_output = gr.Textbox(label="Standard Generation", lines=12)
        with gr.Column():
            gr.Markdown("### π Our Method (Early Stopping)")
            stop_output = gr.Textbox(label="Early Stopping Generation", lines=12)

    # Event listener: the generator fn streams 4-tuples into these outputs.
    run_btn.click(
        fn=simulate_generation,
        inputs=case_dropdown,
        outputs=[question_box, full_output, stop_output, status_bar]
    )

    # Results footer with headline numbers from the paper.
    gr.Markdown("""
---
### π Key Results (from Paper)
Our experiments across five benchmarks (including NQ, GSM8K, GPQA) reveal substantial redundancy in standard CoT:
* **NaturalQuestions (NQ):** Token reduction of over **40%** with improved accuracy using *Learn-to-Stop*.
* **GSM8K:** Token reduction of **~45%** with minimal or no accuracy drop.
* **Methods:** We propose three strategies: *Answer Consistency* (Unsupervised), *Think Token Adjustment* (Unsupervised), and *Learn-to-Stop* (Supervised).
""")
if __name__ == "__main__":
    # Launch the Gradio app when run as a script.
    # (Removed a stray trailing "|" scrape artifact that made this line a
    # SyntaxError.)
    demo.launch()