# app.py — Gradio demo for "Answer Convergence as a Signal for Early Stopping in Reasoning"
# (Hugging Face Space upload by xinliucs, commit 6736c46)
import gradio as gr
import time
# Paper information shown in the demo header.
PAPER_TITLE = "Answer Convergence as a Signal for Early Stopping in Reasoning"
AUTHORS = "Xin Liu, Lu Wang (University of Michigan)"
GITHUB_LINK = "https://github.com/launchnlp/reasoning_earlystop"
# Case data: sourced from the paper's Figure 1 and Table 1.
# Each entry maps a display name to a dict with:
#   question             - the problem statement shown to the user
#   full_reasoning       - the model's complete chain-of-thought trace
#   early_stop_reasoning - the same trace truncated at answer convergence
#   savings              - human-readable summary of the efficiency gain
examples = {
"Case 1: Record Sales (GSM8K) - Early Convergence": {
"question": "Marilyn's first record sold 10 times as many copies as Harald's. If they sold 88,000 copies combined, how many copies did Harald sell?",
"full_reasoning": """<think> Okay, so I need to figure out how many copies Harald sold.
So 11H = 88,000 I need to solve for it by dividing both sides by 11
88,000 divided by 11 should be 8,000.
Wait, let me double-check that.
If I multiply 11 by 8,000, I should get 88,000. That checks out.
I don't think I made any mistakes here.
So Harald sold 8,000 copies.
</think> 8000""",
"early_stop_reasoning": """<think> Okay, so I need to figure out how many copies Harald sold.
So 11H = 88,000 I need to solve for it by dividing both sides by 11
88,000 divided by 11 should be 8,000.
</think> 8000""",
"savings": "Result: The model converged early. Later steps were redundant."
},
"Case 2: Lemon Tree (GSM8K) - Token Savings": {
"question": "Carlos is planting a lemon tree. The tree will cost $90 to plant. Each year it will grow 7 lemons, which he can sell for $1.5 each. It costs $3 a year to water and feed the tree. How many years will it take before he starts earning money on the lemon tree?",
"full_reasoning": """To determine how many years... Solving for n: n=$90/$7.5, n=12.
Wait, let me double-check my calculations to make sure I didn't make a mistake... That seems correct.
Checking the revenue vs cost again...
Everything looks fine.
</think> \\boxed{12}""",
"early_stop_reasoning": """Okay, so Carlos is planting... Calculating that, 90 divided by 7.5 equals 12.
</think> \\boxed{12}""",
"savings": "Efficiency Gain: ~23% Token Reduction (439 -> 338 tokens)."
}
}
def simulate_generation(case_name, cases=None, delay=0.05):
    """Stream a side-by-side comparison of full CoT vs. early-stopped CoT.

    Simulates token-by-token generation (typewriter effect) so the UI can
    show the early-stopping run finishing while the full run keeps going.

    Args:
        case_name: Key identifying the case to replay.
        cases: Mapping of case name -> case dict; defaults to the
            module-level ``examples`` when None.
        delay: Seconds to sleep between streamed chunks; pass 0 to
            disable pacing (useful for tests).

    Yields:
        4-tuples ``(question, full_text_so_far, stop_text_so_far, status)``
        matching the four output components wired up in the UI.
    """
    case = (examples if cases is None else cases)[case_name]
    question = case["question"]
    full_text = case["full_reasoning"]
    stop_text = case["early_stop_reasoning"]

    # Initial state before any "generation" happens.
    yield question, "", "", "Initializing..."

    step_size = 5  # characters revealed per tick
    for i in range(0, max(len(full_text), len(stop_text)), step_size):
        # Python slicing clamps past the end, so no length guards needed.
        current_full = full_text[:i]
        current_stop = stop_text[:i]
        if i >= len(stop_text) and i < len(full_text):
            # The early-stop run is done but the full run is still going.
            status = "⚑ Early Stopping Triggered! (Saving Compute) ⚑"
        elif i >= len(full_text):
            status = "Done."
        else:
            status = "Generating..."
        yield question, current_full, current_stop, status
        if delay:
            time.sleep(delay)

    # Final yield guarantees the complete text and savings summary are shown.
    yield question, full_text, stop_text, f"Done! {case['savings']}"
# Build the Gradio interface. Statement order inside the Blocks context
# defines the rendered layout, so nothing here should be reordered.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # --- Header: paper title, authors, and external links ---
    gr.Markdown(f"# πŸ›‘ {PAPER_TITLE}")
    gr.Markdown(f"**Authors:** {AUTHORS}")
    gr.Markdown(f"**Code & Resources:** [GitHub Repository]({GITHUB_LINK}) | [ArXiv Paper](https://arxiv.org/abs/2506.02536)")
    gr.Markdown("""
### πŸ’‘ Demo Description
This interactive demo illustrates the core concept of our **Early Stopping** strategy.
* **Left Panel:** Shows the model's full Chain-of-Thought (CoT) reasoning process.
* **Right Panel:** Shows the reasoning process truncated by our method.
**Key Insight:** Models often reach **Answer Convergence** (the correct answer) well before completing the full reasoning chain. Subsequent steps are often redundant self-verification, which can be safely skipped to reduce inference costs.
""")
    # --- Controls: case selector, run button, and live status readout ---
    with gr.Row():
        case_dropdown = gr.Dropdown(
            choices=list(examples.keys()),
            value="Case 1: Record Sales (GSM8K) - Early Convergence",
            label="Select a Test Case"
        )
        run_btn = gr.Button("▢️ Run Simulation", variant="primary")
        status_bar = gr.Textbox(label="Status", value="Ready to run", interactive=False)
    # --- Display: full CoT (left) vs. early-stopped CoT (right) ---
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🐒 Original (Full CoT)")
            full_output = gr.Textbox(label="Standard Generation", lines=12)
        with gr.Column():
            gr.Markdown("### πŸ‡ Our Method (Early Stopping)")
            stop_output = gr.Textbox(label="Early Stopping Generation", lines=12)
    # Wire the generator to the outputs; each yield updates all four components.
    # NOTE(review): the hidden Textbox is a sink for the `question` element of
    # each yield, which is never displayed — consider a visible question box.
    run_btn.click(
        fn=simulate_generation,
        inputs=case_dropdown,
        outputs=[gr.Textbox(visible=False), full_output, stop_output, status_bar]
    )
    # --- Footer: headline numbers reported in the paper ---
    gr.Markdown("""
---
### πŸ“Š Key Results (from Paper)
Our experiments across five benchmarks (including NQ, GSM8K, GPQA) reveal substantial redundancy in standard CoT:
* **NaturalQuestions (NQ):** Token reduction of over **40%** with improved accuracy using *Learn-to-Stop*.
* **GSM8K:** Token reduction of **~45%** with minimal or no accuracy drop.
* **Methods:** We propose three strategies: *Answer Consistency* (Unsupervised), *Think Token Adjustment* (Unsupervised), and *Learn-to-Stop* (Supervised).
""")
# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()