xinliucs committed on
Commit
6736c46
·
verified ·
1 Parent(s): a07ece3

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +138 -0
app.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import time
3
+
4
+ # Paper Information
5
+ PAPER_TITLE = "Answer Convergence as a Signal for Early Stopping in Reasoning"
6
+ AUTHORS = "Xin Liu, Lu Wang (University of Michigan)"
7
+ GITHUB_LINK = "https://github.com/launchnlp/reasoning_earlystop"
8
+
9
+ # Case data: sourced from the paper's Figure 1 and Table 1.
10
# Demo cases keyed by the label shown in the case dropdown. Each entry holds
# the question, the full reasoning trace, the truncated (early-stopped)
# trace, and a summary that is appended to the final status message.
examples = {
    "Case 1: Record Sales (GSM8K) - Early Convergence": {
        "question": (
            "Marilyn's first record sold 10 times as many copies as Harald's. "
            "If they sold 88,000 copies combined, how many copies did Harald sell?"
        ),
        "full_reasoning": (
            "<think> Okay, so I need to figure out how many copies Harald sold.\n"
            "So 11H = 88,000 I need to solve for it by dividing both sides by 11\n"
            "88,000 divided by 11 should be 8,000.\n"
            "Wait, let me double-check that.\n"
            "If I multiply 11 by 8,000, I should get 88,000. That checks out.\n"
            "I don't think I made any mistakes here.\n"
            "So Harald sold 8,000 copies.\n"
            "</think> 8000"
        ),
        "early_stop_reasoning": (
            "<think> Okay, so I need to figure out how many copies Harald sold.\n"
            "So 11H = 88,000 I need to solve for it by dividing both sides by 11\n"
            "88,000 divided by 11 should be 8,000.\n"
            "</think> 8000"
        ),
        "savings": "Result: The model converged early. Later steps were redundant.",
    },
    "Case 2: Lemon Tree (GSM8K) - Token Savings": {
        "question": (
            "Carlos is planting a lemon tree. The tree will cost $90 to plant. "
            "Each year it will grow 7 lemons, which he can sell for $1.5 each. "
            "It costs $3 a year to water and feed the tree. "
            "How many years will it take before he starts earning money on the lemon tree?"
        ),
        "full_reasoning": (
            "To determine how many years... Solving for n: n=$90/$7.5, n=12.\n"
            "Wait, let me double-check my calculations to make sure I didn't make a mistake... That seems correct.\n"
            "Checking the revenue vs cost again...\n"
            "Everything looks fine.\n"
            "</think> \\boxed{12}"
        ),
        "early_stop_reasoning": (
            "Okay, so Carlos is planting... Calculating that, 90 divided by 7.5 equals 12.\n"
            "</think> \\boxed{12}"
        ),
        "savings": "Efficiency Gain: ~23% Token Reduction (439 -> 338 tokens).",
    },
}
39
+
40
def simulate_generation(case_name, *, step_size=5, delay=0.05):
    """Replay a stored case as a side-by-side typewriter animation.

    This is a generator: each yielded frame is a 4-tuple
    ``(question, full_so_far, early_stop_so_far, status)``.

    Args:
        case_name: Key into the module-level ``examples`` mapping.
        step_size: Number of characters revealed per frame (clamped to >= 1).
        delay: Seconds to sleep after each intermediate frame; ``0`` or a
            negative value disables sleeping.

    Yields:
        An initial frame with empty reasoning text, one frame per revealed
        chunk, and a final frame carrying both complete texts plus the
        case's ``savings`` summary. If ``case_name`` is not a known case
        (for example ``None``), a single frame asking the user to pick a
        case is yielded instead of raising ``KeyError``.
    """
    case = examples.get(case_name)
    if case is None:
        yield "", "", "", "Please select a test case."
        return

    question = case["question"]
    full_text = case["full_reasoning"]
    stop_text = case["early_stop_reasoning"]
    step_size = max(1, int(step_size))

    # Initial state: question only, nothing generated yet.
    yield question, "", "", "Initializing..."

    longest = max(len(full_text), len(stop_text))
    # Start at step_size rather than 0: the empty frame was already emitted.
    for shown in range(step_size, longest, step_size):
        stop_finished = shown >= len(stop_text)
        full_finished = shown >= len(full_text)
        # Only announce early stopping while the full trace is still running;
        # in every other case at least one side is still being revealed.
        if stop_finished and not full_finished:
            status = "⚡ Early Stopping Triggered! (Saving Compute) ⚡"
        else:
            status = "Generating..."

        # Slicing past the end simply returns the whole string, so no
        # explicit length guard is needed.
        yield question, full_text[:shown], stop_text[:shown], status
        if delay > 0:
            time.sleep(delay)

    # Final frame guarantees the complete texts are displayed.
    yield question, full_text, stop_text, f"Done! {case['savings']}"
81
+
82
+ # Build the Gradio Interface
83
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Header section: paper title, authors, and links.
    gr.Markdown(f"# 🛑 {PAPER_TITLE}")
    gr.Markdown(f"**Authors:** {AUTHORS}")
    gr.Markdown(f"**Code & Resources:** [GitHub Repository]({GITHUB_LINK}) | [ArXiv Paper](https://arxiv.org/abs/2506.02536)")

    gr.Markdown("""
### 💡 Demo Description
This interactive demo illustrates the core concept of our **Early Stopping** strategy.
* **Left Panel:** Shows the model's full Chain-of-Thought (CoT) reasoning process.
* **Right Panel:** Shows the reasoning process truncated by our method.

**Key Insight:** Models often reach **Answer Convergence** (the correct answer) well before completing the full reasoning chain. Subsequent steps are often redundant self-verification, which can be safely skipped to reduce inference costs.
""")

    # Control section: pick a stored case and start the replay.
    with gr.Row():
        case_dropdown = gr.Dropdown(
            choices=list(examples),
            value="Case 1: Record Sales (GSM8K) - Early Convergence",
            label="Select a Test Case",
        )
        run_btn = gr.Button("▶️ Run Simulation", variant="primary")

    # The generator yields the question as its first output; show it in a
    # named, read-only box instead of discarding it into a hidden component.
    question_box = gr.Textbox(label="Question", lines=2, interactive=False)

    status_bar = gr.Textbox(label="Status", value="Ready to run", interactive=False)

    # Display section: full trace on the left, early-stopped trace on the right.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🐢 Original (Full CoT)")
            full_output = gr.Textbox(label="Standard Generation", lines=12)
        with gr.Column():
            gr.Markdown("### 🐇 Our Method (Early Stopping)")
            stop_output = gr.Textbox(label="Early Stopping Generation", lines=12)

    # Event listener: the output order must match the 4-tuples yielded by
    # simulate_generation (question, full text, early-stop text, status).
    run_btn.click(
        fn=simulate_generation,
        inputs=case_dropdown,
        outputs=[question_box, full_output, stop_output, status_bar],
    )

    # Results footer.
    gr.Markdown("""
---
### 📊 Key Results (from Paper)

Our experiments across five benchmarks (including NQ, GSM8K, GPQA) reveal substantial redundancy in standard CoT:

* **NaturalQuestions (NQ):** Token reduction of over **40%** with improved accuracy using *Learn-to-Stop*.
* **GSM8K:** Token reduction of **~45%** with minimal or no accuracy drop.
* **Methods:** We propose three strategies: *Answer Consistency* (Unsupervised), *Think Token Adjustment* (Unsupervised), and *Learn-to-Stop* (Supervised).
""")

if __name__ == "__main__":
    demo.launch()