File size: 5,967 Bytes
6736c46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import gradio as gr
import time

# Paper Information — rendered in the demo header below.
PAPER_TITLE = "Answer Convergence as a Signal for Early Stopping in Reasoning"
AUTHORS = "Xin Liu, Lu Wang (University of Michigan)"
GITHUB_LINK = "https://github.com/launchnlp/reasoning_earlystop"

# Case data: sourced from the paper's Figure 1 and Table 1.
# Each case maps a display name to:
#   question            — the problem statement shown to the user
#   full_reasoning      — the complete chain-of-thought (baseline generation)
#   early_stop_reasoning — the same CoT truncated at answer convergence
#   savings             — summary string shown in the status bar when done
examples = {
    "Case 1: Record Sales (GSM8K) - Early Convergence": {
        "question": "Marilyn's first record sold 10 times as many copies as Harald's. If they sold 88,000 copies combined, how many copies did Harald sell?",
        "full_reasoning": """<think> Okay, so I need to figure out how many copies Harald sold.
So 11H = 88,000 I need to solve for it by dividing both sides by 11
88,000 divided by 11 should be 8,000.
Wait, let me double-check that.
If I multiply 11 by 8,000, I should get 88,000. That checks out.
I don't think I made any mistakes here.
So Harald sold 8,000 copies.
</think> 8000""",
        "early_stop_reasoning": """<think> Okay, so I need to figure out how many copies Harald sold.
So 11H = 88,000 I need to solve for it by dividing both sides by 11
88,000 divided by 11 should be 8,000.
</think> 8000""",
        "savings": "Result: The model converged early. Later steps were redundant."
    },
    "Case 2: Lemon Tree (GSM8K) - Token Savings": {
        "question": "Carlos is planting a lemon tree. The tree will cost $90 to plant. Each year it will grow 7 lemons, which he can sell for $1.5 each. It costs $3 a year to water and feed the tree. How many years will it take before he starts earning money on the lemon tree?",
        "full_reasoning": """To determine how many years... Solving for n: n=$90/$7.5, n=12.
Wait, let me double-check my calculations to make sure I didn't make a mistake... That seems correct.
Checking the revenue vs cost again...
Everything looks fine.
</think> \\boxed{12}""",
        "early_stop_reasoning": """Okay, so Carlos is planting... Calculating that, 90 divided by 7.5 equals 12.
</think> \\boxed{12}""",
        "savings": "Efficiency Gain: ~23% Token Reduction (439 -> 338 tokens)."
    }
}

def simulate_generation(case_name):
    """Stream a side-by-side comparison of full CoT vs. early-stopped CoT.

    Yields 4-tuples of (question, full_cot_so_far, early_stop_cot_so_far,
    status_message) in a typewriter fashion so Gradio can render both panels
    growing in lockstep until the early-stop panel finishes first.

    Args:
        case_name: A key of the module-level ``examples`` dict.

    Yields:
        tuple[str, str, str, str]: question text, partial full reasoning,
        partial early-stop reasoning, and a status string.
    """
    case = examples[case_name]
    # Initial state: clear both panels before streaming begins.
    yield case["question"], "", "", "Initializing..."

    full_text = case["full_reasoning"]
    stop_text = case["early_stop_reasoning"]
    max_len = max(len(full_text), len(stop_text))

    # Emit 5 characters per tick to simulate token-by-token generation.
    step_size = 5
    for i in range(0, max_len, step_size):
        # Slicing past the end of a string is safe in Python (it just returns
        # the whole string), so no explicit length guards are needed here.
        current_full = full_text[:i]
        current_stop = stop_text[:i]

        # Status: "Done." once the longer (full) stream is exhausted;
        # otherwise flag the window where only the early-stop stream has ended.
        if i >= len(full_text):
            status = "Done."
        elif i >= len(stop_text):
            status = "⚑ Early Stopping Triggered! (Saving Compute) ⚑"
        else:
            status = "Generating..."

        yield case["question"], current_full, current_stop, status
        time.sleep(0.05)  # pacing for the typewriter effect

    # Final yield guarantees both panels show their complete text.
    yield case["question"], full_text, stop_text, f"Done! {case['savings']}"

# Build the Gradio Interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Header Section
    gr.Markdown(f"# πŸ›‘ {PAPER_TITLE}")
    gr.Markdown(f"**Authors:** {AUTHORS}")
    gr.Markdown(f"**Code & Resources:** [GitHub Repository]({GITHUB_LINK}) | [ArXiv Paper](https://arxiv.org/abs/2506.02536)")
    
    gr.Markdown("""
    ### πŸ’‘ Demo Description
    This interactive demo illustrates the core concept of our **Early Stopping** strategy. 
    * **Left Panel:** Shows the model's full Chain-of-Thought (CoT) reasoning process.
    * **Right Panel:** Shows the reasoning process truncated by our method.
    
    **Key Insight:** Models often reach **Answer Convergence** (the correct answer) well before completing the full reasoning chain. Subsequent steps are often redundant self-verification, which can be safely skipped to reduce inference costs.
    """)
    
    # Control Section
    with gr.Row():
        case_dropdown = gr.Dropdown(
            choices=list(examples.keys()), 
            value="Case 1: Record Sales (GSM8K) - Early Convergence", 
            label="Select a Test Case"
        )
        run_btn = gr.Button("▢️ Run Simulation", variant="primary")
    
    # The generator's first output is the question text; show it instead of
    # discarding it into a hidden component.
    question_box = gr.Textbox(label="Question", interactive=False)
    status_bar = gr.Textbox(label="Status", value="Ready to run", interactive=False)
    
    # Display Section
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🐒 Original (Full CoT)")
            full_output = gr.Textbox(label="Standard Generation", lines=12)
        with gr.Column():
            gr.Markdown("### πŸ‡ Our Method (Early Stopping)")
            stop_output = gr.Textbox(label="Early Stopping Generation", lines=12)
            
    # Event Listener
    # NOTE(fix): previously the first output slot was an inline
    # gr.Textbox(visible=False), so the yielded question was never displayed.
    run_btn.click(
        fn=simulate_generation,
        inputs=case_dropdown,
        outputs=[question_box, full_output, stop_output, status_bar]
    )

    # Results Footer
    gr.Markdown("""
    ---
    ### πŸ“Š Key Results (from Paper)
    
    Our experiments across five benchmarks (including NQ, GSM8K, GPQA) reveal substantial redundancy in standard CoT:
    
    * **NaturalQuestions (NQ):** Token reduction of over **40%** with improved accuracy using *Learn-to-Stop*.
    * **GSM8K:** Token reduction of **~45%** with minimal or no accuracy drop.
    * **Methods:** We propose three strategies: *Answer Consistency* (Unsupervised), *Think Token Adjustment* (Unsupervised), and *Learn-to-Stop* (Supervised).
    """)

if __name__ == "__main__":
    demo.launch()