import gradio as gr
import time

# Paper information shown in the demo header.
PAPER_TITLE = "Answer Convergence as a Signal for Early Stopping in Reasoning"
AUTHORS = "Xin Liu, Lu Wang (University of Michigan)"
GITHUB_LINK = "https://github.com/launchnlp/reasoning_earlystop"

# Case data: sourced from the paper (Figure 1 and Table 1 examples).
# Each case holds the question, the full chain-of-thought, the early-stopped
# chain-of-thought, and a summary of the efficiency gain.
examples = {
    "Case 1: Record Sales (GSM8K) - Early Convergence": {
        "question": "Marilyn's first record sold 10 times as many copies as Harald's. If they sold 88,000 copies combined, how many copies did Harald sell?",
        "full_reasoning": """ Okay, so I need to figure out how many copies Harald sold. So 11H = 88,000 I need to solve for it by dividing both sides by 11 88,000 divided by 11 should be 8,000. Wait, let me double-check that. If I multiply 11 by 8,000, I should get 88,000. That checks out. I don't think I made any mistakes here. So Harald sold 8,000 copies. 8000""",
        "early_stop_reasoning": """ Okay, so I need to figure out how many copies Harald sold. So 11H = 88,000 I need to solve for it by dividing both sides by 11 88,000 divided by 11 should be 8,000. 8000""",
        "savings": "Result: The model converged early. Later steps were redundant.",
    },
    "Case 2: Lemon Tree (GSM8K) - Token Savings": {
        "question": "Carlos is planting a lemon tree. The tree will cost $90 to plant. Each year it will grow 7 lemons, which he can sell for $1.5 each. It costs $3 a year to water and feed the tree. How many years will it take before he starts earning money on the lemon tree?",
        "full_reasoning": """To determine how many years... Solving for n: n=$90/$7.5, n=12. Wait, let me double-check my calculations to make sure I didn't make a mistake... That seems correct. Checking the revenue vs cost again... Everything looks fine. \\boxed{12}""",
        "early_stop_reasoning": """Okay, so Carlos is planting... Calculating that, 90 divided by 7.5 equals 12. \\boxed{12}""",
        "savings": "Efficiency Gain: ~23% Token Reduction (439 -> 338 tokens).",
    },
}


def simulate_generation(case_name):
    """Simulate token-by-token generation for the selected case.

    Yields 4-tuples of (question, full CoT so far, early-stop CoT so far,
    status message) so Gradio can stream a typewriter-style comparison of
    standard decoding vs. the paper's early-stopping strategy.
    """
    case = examples[case_name]

    # Initial state: show the question, clear both panels.
    yield case["question"], "", "", "Initializing..."

    full_text = case["full_reasoning"]
    stop_text = case["early_stop_reasoning"]
    max_len = max(len(full_text), len(stop_text))

    # Simulate streaming output (typewriter effect), 5 characters per tick.
    step_size = 5
    for i in range(0, max_len, step_size):
        # Slicing past the end of a string is safe in Python and simply
        # returns the whole string, so no length guard is needed.
        current_full = full_text[:i]
        current_stop = stop_text[:i]

        # Status: flag the window where the early-stop panel has finished
        # but the full CoT is still generating (the compute being saved).
        status = "Generating..."
        if i >= len(stop_text) and i < len(full_text):
            status = "⚡ Early Stopping Triggered! (Saving Compute) ⚡"
        elif i >= len(full_text):
            status = "Done."

        yield case["question"], current_full, current_stop, status
        time.sleep(0.05)

    # Final yield to ensure the complete text is shown regardless of step size.
    yield case["question"], full_text, stop_text, f"Done! {case['savings']}"


# Build the Gradio interface.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Header section.
    gr.Markdown(f"# 🛑 {PAPER_TITLE}")
    gr.Markdown(f"**Authors:** {AUTHORS}")
    gr.Markdown(f"**Code & Resources:** [GitHub Repository]({GITHUB_LINK}) | [ArXiv Paper](https://arxiv.org/abs/2506.02536)")
    gr.Markdown("""
### 💡 Demo Description
This interactive demo illustrates the core concept of our **Early Stopping** strategy.
* **Left Panel:** Shows the model's full Chain-of-Thought (CoT) reasoning process.
* **Right Panel:** Shows the reasoning process truncated by our method.

**Key Insight:** Models often reach **Answer Convergence** (the correct answer) well before completing the full reasoning chain. Subsequent steps are often redundant self-verification, which can be safely skipped to reduce inference costs.
""")

    # Control section.
    with gr.Row():
        case_dropdown = gr.Dropdown(
            choices=list(examples.keys()),
            value="Case 1: Record Sales (GSM8K) - Early Convergence",
            label="Select a Test Case",
        )
        run_btn = gr.Button("▶️ Run Simulation", variant="primary")

    # Question display: the generator's first output is the question text;
    # previously it was routed into an invisible throwaway Textbox and never
    # shown — this visible box fixes that.
    question_box = gr.Textbox(label="Question", interactive=False)
    status_bar = gr.Textbox(label="Status", value="Ready to run", interactive=False)

    # Display section: side-by-side comparison panels.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🐢 Original (Full CoT)")
            full_output = gr.Textbox(label="Standard Generation", lines=12)
        with gr.Column():
            gr.Markdown("### 🐇 Our Method (Early Stopping)")
            stop_output = gr.Textbox(label="Early Stopping Generation", lines=12)

    # Event listener: stream the 4-tuple yields into the four components.
    run_btn.click(
        fn=simulate_generation,
        inputs=case_dropdown,
        outputs=[question_box, full_output, stop_output, status_bar],
    )

    # Results footer.
    gr.Markdown("""
---
### 📊 Key Results (from Paper)
Our experiments across five benchmarks (including NQ, GSM8K, GPQA) reveal substantial redundancy in standard CoT:
* **NaturalQuestions (NQ):** Token reduction of over **40%** with improved accuracy using *Learn-to-Stop*.
* **GSM8K:** Token reduction of **~45%** with minimal or no accuracy drop.
* **Methods:** We propose three strategies: *Answer Consistency* (Unsupervised), *Think Token Adjustment* (Unsupervised), and *Learn-to-Stop* (Supervised).
""")

if __name__ == "__main__":
    demo.launch()