# Spaces: Sleeping  (hosting-page status banner captured with this listing; not program code)
| """ | |
| CoT Spatial Reasoning Degradation Demo | |
| Based on: "Chain-of-Thought Degrades Visual Spatial Reasoning" (arXiv:2604.16060) | |
| """ | |
| import gradio as gr | |
| from PIL import Image, ImageDraw | |
| import random | |
def create_grid_puzzle():
    """Build the 3x3 shape-grid puzzle.

    Nine shapes are drawn on a 400x400 white canvas. The shape kind cycles
    circle -> square -> triangle with (row + column), and the fill colour
    cycles through a six-colour palette by row-major cell index.

    Returns:
        tuple: ``(image, question, expected)`` where ``image`` is the PIL
        image, ``question`` asks for the shape in the centre cell, and
        ``expected`` is the shape name recorded for that cell.
    """
    canvas = Image.new('RGB', (400, 400), color='white')
    pen = ImageDraw.Draw(canvas)
    palette = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8', '#F7DC6F']
    kinds = ("circle", "square", "triangle")

    cells = []
    for index in range(9):
        row, col = divmod(index, 3)
        left = 50 + col * 100
        top = 50 + row * 100
        fill = palette[index % len(palette)]
        kind = kinds[(row + col) % 3]

        box = [left, top, left + 60, top + 60]
        if kind == "circle":
            pen.ellipse(box, fill=fill, outline='black', width=2)
        elif kind == "square":
            pen.rectangle(box, fill=fill, outline='black', width=2)
        else:
            corners = [(left + 30, top), (left + 60, top + 60), (left, top + 60)]
            pen.polygon(corners, fill=fill, outline='black', width=2)

        cells.append({
            "row": row + 1,
            "col": col + 1,
            "shape": kind,
            "color": fill,
        })

    # Row-major order puts the centre cell (row 2, column 2) at index 4.
    centre = cells[4]
    question = "What shape is in the center (row 2, column 2)?"
    return canvas, question, centre["shape"]
def create_rotation_puzzle():
    """Build the mental-rotation puzzle.

    A blue L-shaped reference figure labelled "Original" is drawn on a
    500x200 white canvas, followed by three coloured candidates. Each
    candidate is drawn from its own bar geometry, so the candidates really
    differ: one is the L rotated 90 degrees clockwise, one is unrotated,
    and one is rotated 180 degrees. (Previously the per-option geometry was
    ignored and all three candidates were drawn as the same unrotated L.)

    Returns:
        tuple: ``(image, question, expected)`` where ``expected`` is the
        label of the candidate showing the 90-degree clockwise rotation.
    """
    img = Image.new('RGB', (500, 200), color='white')
    draw = ImageDraw.Draw(img)

    def draw_bars(left, bars, fill):
        """Draw bars given as (x0, y0, x1, y1) offsets inside a 60x60 cell at (left, 50)."""
        for x0, y0, x1, y1 in bars:
            draw.rectangle(
                [left + x0, 50 + y0, left + x1, 50 + y1],
                fill=fill, outline='black', width=2,
            )

    # Upright L: vertical bar on the left, horizontal foot along the bottom.
    upright = [(0, 0, 30, 60), (0, 30, 60, 60)]
    # 90 degrees clockwise: the foot swings up to become a bar along the top.
    quarter_turn = [(0, 0, 30, 60), (0, 0, 60, 30)]
    # 180 degrees: vertical bar on the right, horizontal bar along the top.
    half_turn = [(30, 0, 60, 60), (0, 0, 60, 30)]

    draw_bars(50, upright, '#3498DB')
    draw.text((60, 120), "Original", fill='black')

    # NOTE(review): the "Β°" in the strings below looks like a mis-encoded
    # degree sign. It is left byte-for-byte here because the question text
    # is matched by substring elsewhere in this file; fix both together.
    options = [
        ("90Β° rotation", quarter_turn, 'red'),
        ("No rotation", upright, 'green'),
        ("180Β° rotation", half_turn, 'purple'),
    ]
    for i, (label, bars, color) in enumerate(options):
        x = 150 + i * 100
        draw_bars(x, bars, color)
        draw.text((x, 120), label, fill='black')

    question = "Which shape shows the original rotated 90Β° clockwise?"
    expected = "90Β° rotation"
    return img, question, expected
def create_pattern_completion():
    """Build the pattern-completion puzzle.

    Six slots are laid out left to right on a 600x150 white canvas: a
    repeating circle / square / triangle sequence whose final slot is
    replaced by a grey box containing a question mark.

    Returns:
        tuple: ``(image, question, expected)`` where ``expected`` is the
        name of the shape that belongs in the final slot.
    """
    canvas = Image.new('RGB', (600, 150), color='white')
    pen = ImageDraw.Draw(canvas)

    red_circle = ('circle', '#E74C3C')
    blue_square = ('square', '#3498DB')
    green_triangle = ('triangle', '#2ECC71')
    sequence = [
        red_circle,
        blue_square,
        green_triangle,
        red_circle,
        blue_square,
        (None, 'white'),  # the slot the viewer has to fill in
    ]

    top = 40
    for slot, (kind, fill) in enumerate(sequence):
        left = 40 + slot * 90
        box = [left, top, left + 50, top + 50]
        if kind == 'circle':
            pen.ellipse(box, fill=fill, outline='black', width=2)
        elif kind == 'square':
            pen.rectangle(box, fill=fill, outline='black', width=2)
        elif kind == 'triangle':
            corners = [(left + 25, top), (left + 50, top + 50), (left, top + 50)]
            pen.polygon(corners, fill=fill, outline='black', width=2)
        else:
            # Placeholder box with a question mark for the missing shape.
            pen.rectangle(box, fill='#F8F9FA', outline='black', width=2)
            pen.text((left + 15, top + 15), "?", fill='black', font=None)

    return canvas, "What shape completes the pattern?", "triangle"
def generate_cot_response(question, expected, use_cot):
    """Simulate a model's answer with or without chain-of-thought (CoT).

    No model is called; the reply is scripted. The scripted answers are
    derived from ``expected`` instead of being hard-coded, so a "correct"
    reply always agrees with the puzzle's ground truth, and a string is
    returned for every question (the CoT branch used to fall through and
    return ``None`` when no keyword matched).

    Args:
        question: Puzzle question text. Only used to choose the wording of
            the CoT conclusion and the pool of plausible wrong answers.
        expected: Ground-truth answer for the puzzle.
        use_cot: When false, return the direct answer. When true, return a
            scripted reasoning preamble followed by a conclusion.

    Returns:
        str: ``expected`` itself when ``use_cot`` is false. Otherwise the
        preamble plus a conclusion that names ``expected``, or, with
        probability 0.3, a wrong answer instead.
    """
    if not use_cot:
        # Direct answering is modelled as reading the answer off the image.
        return expected

    cot_thinking = (
        "\n"
        "Let me think step by step:\n"
        "1. First, I need to analyze the visual elements\n"
        "2. Looking at the pattern, there are geometric shapes\n"
        "3. Based on common patterns in these types of puzzles...\n"
        "4. The answer is likely what's most commonly seen\n"
    )

    shapes = ("circle", "square", "triangle")
    # (keyword looked for in the question, conclusion wording, wrong-answer pool)
    scripts = (
        ("center", "The center shape is a **{}**", shapes),
        ("rotat", "The rotated shape is the one labelled **{}**", ("No rotation",)),
        ("pattern", "The pattern completes with a **{}**", shapes),
    )
    lowered = question.lower()
    conclusion, candidates = "The answer is **{}**", ()
    for keyword, wording, wrong_answers in scripts:
        if keyword in lowered:
            conclusion, candidates = wording, wrong_answers
            break

    answer = expected
    if random.random() < 0.3:  # 30% simulated degradation
        truth = expected.lower()
        # A distractor must not contain the truth, otherwise a substring
        # check against the reply would still count it as correct.
        distractors = [c for c in candidates if truth not in c.lower()]
        answer = distractors[0] if distractors else "unclear"

    return cot_thinking + "\n" + conclusion.format(answer)
def run_comparison(puzzle_type):
    """Run the simulated CoT vs. no-CoT comparison for one puzzle.

    Args:
        puzzle_type: "Spatial Grid", "Mental Rotation", or any other value,
            which selects the pattern-completion puzzle.

    Returns:
        tuple: ``(image, markdown)`` where ``image`` is the puzzle image and
        ``markdown`` is a report showing both responses, whether each one
        contains the expected answer, and a degradation verdict.
    """
    if puzzle_type == "Spatial Grid":
        img, question, expected = create_grid_puzzle()
    elif puzzle_type == "Mental Rotation":
        img, question, expected = create_rotation_puzzle()
    else:  # Pattern Completion
        img, question, expected = create_pattern_completion()

    # Coerce a missing reply to "" so the checks below cannot fail on None.
    no_cot_response = generate_cot_response(question, expected, False) or ""
    cot_response = generate_cot_response(question, expected, True) or ""

    # A response counts as correct when it mentions the expected answer
    # (case-insensitive substring match).
    wanted = expected.lower()
    no_cot_correct = wanted in no_cot_response.lower()
    cot_correct = wanted in cot_response.lower()

    if no_cot_correct and not cot_correct:
        degradation = "❌ YES - CoT introduced errors"
    elif cot_correct == no_cot_correct:
        degradation = "✅ No degradation"
    else:
        degradation = "⚠️ Mixed results"

    # Blank lines separate the Markdown blocks; without one, a "---" that
    # directly follows a text line turns that line into a heading instead
    # of rendering as a horizontal rule.
    report = [
        f"## {puzzle_type} Test Results",
        "",
        f"**Question:** {question}",
        "",
        f"**Expected Answer:** {expected}",
        "",
        "### Without CoT (Direct):",
        "",
        no_cot_response,
        "",
        f"**Correct:** {'✅ YES' if no_cot_correct else '❌ NO'}",
        "",
        "---",
        "",
        "### With CoT (Step-by-step):",
        "",
        cot_response,
        "",
        f"**Correct:** {'✅ YES' if cot_correct else '❌ NO'}",
        "",
        "---",
        "",
        "### Analysis:",
        "",
        f"- **No-CoT Accuracy:** {'✅' if no_cot_correct else '❌'}",
        f"- **CoT Accuracy:** {'✅' if cot_correct else '❌'}",
        f"- **CoT Degradation:** {degradation}",
        "",
    ]
    result = "\n".join(report)
    return img, result
def show_paper_findings():
    """Return a static Markdown summary of the paper's key findings.

    The text is fixed; nothing is computed. The bullet markers in the
    recommendation list are real emoji (they were previously mis-encoded
    and rendered as stray Greek letters).

    Returns:
        str: Markdown text for display in a ``gr.Markdown`` component.
    """
    return """
## Key Findings from Paper (arXiv:2604.16060)

### Main Result
**"CoT prompting consistently degrades performance in visual spatial reasoning"**

### Evidence
- Evaluated **17 models** across **13 spatial benchmarks**
- Found systematic degradation with CoT prompting
- Identified shortcut learning from textual priors

### Root Cause
1. **Shortcut Learning:** Models rely on text patterns instead of visual analysis
2. **Hallucination:** Models generate visual details from text alone (No-Image++ ablation)
3. **Textual Prior Dominance:** Language priors override visual reasoning

### Implications
> "These findings challenge the efficacy of text-only CoT for spatial tasks and underscore the need for vision-centric reasoning paradigms."

### Recommendation
For spatial reasoning tasks:
- ❌ Avoid Chain-of-Thought prompting
- ✅ Use direct visual reasoning
- ✅ Develop vision-centric reasoning methods
"""
# Gradio Interface

# Static Markdown shown above and below the tabs. Blank lines keep each
# piece in its own paragraph when rendered. The emoji were previously
# mis-encoded and displayed as stray Greek letters.
_INTRO_MD = """
# 🧠 CoT Degrades Spatial Reasoning

Interactive demonstration of findings from:
**"Chain-of-Thought Degrades Visual Spatial Reasoning Capabilities of Multimodal LLMs"**

**Core Claim:** CoT causes shortcut learning, degrading spatial reasoning performance.
"""

_REFERENCE_MD = """
---

### 📄 Paper Reference

**Chain-of-Thought Degrades Visual Spatial Reasoning Capabilities of Multimodal LLMs**

Sai Srinivas Kancheti, Aditya Sanjiv Kanade, Vineeth N. Balasubramanian, Tanuja Ganu

*Microsoft Research*

arXiv:2604.16060
"""

# NOTE(review): the nesting of the layout below was reconstructed from a
# listing whose indentation was lost -- confirm it against the intended UI.
with gr.Blocks(title="CoT Spatial Reasoning Degradation") as demo:
    gr.Markdown(_INTRO_MD)

    with gr.Tab("Live Comparison"):
        with gr.Row():
            with gr.Column():
                puzzle_select = gr.Dropdown(
                    choices=["Spatial Grid", "Mental Rotation", "Pattern Completion"],
                    value="Spatial Grid",
                    label="Select Puzzle Type",
                )
                run_btn = gr.Button("Run Test", variant="primary")
            with gr.Column():
                puzzle_image = gr.Image(label="Puzzle", type="pil")
        results_md = gr.Markdown()

        # One click regenerates the puzzle image and the comparison report.
        run_btn.click(
            fn=run_comparison,
            inputs=[puzzle_select],
            outputs=[puzzle_image, results_md],
        )

    with gr.Tab("Paper Findings"):
        findings_btn = gr.Button("Show Findings", variant="secondary")
        findings_md = gr.Markdown()
        findings_btn.click(fn=show_paper_findings, outputs=[findings_md])

    gr.Markdown(_REFERENCE_MD)

if __name__ == "__main__":
    demo.launch()