Spaces:

WellGoods
/

VibeThinker

Paused

App Files Community

VladBoyko committed on Nov 17, 2025

Commit

1ff1f45

verified ·

1 Parent(s): 3b97453

Update app.py

Browse files

adjusting output formatting and parsing

Files changed (1) hide show

app.py +267 -372

app.py CHANGED Viewed

@@ -1,441 +1,336 @@
 import gradio as gr
 import re
 from vllm import LLM, SamplingParams
-import spaces
-import os
-# Force XFormers backend for T4 GPU compatibility (prevent Triton compilation errors)
-os.environ["VLLM_ATTENTION_BACKEND"] = "XFORMERS"
-os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
 class VibeThinkerVLLM:
-    def __init__(self, model_path="WeiboAI/VibeThinker-1.5B"):
-        self.model_path = model_path
-        print("Loading model with vLLM... This may take a minute.")
-        # T4 GPU compatible - using float16 with XFormers backend
-        self.model = LLM(
-            model=self.model_path,
-            dtype="float16",
-            gpu_memory_utilization=0.85,
-            max_model_len=16384,  # Reduced for T4 stability
-            trust_remote_code=True,
-            enforce_eager=True,  # Disable CUDA graphs to save memory
-            disable_custom_all_reduce=True,  # Prevent Triton compilation issues
-            enable_prefix_caching=False,  # Disable prefix caching (causes Triton issues on T4)
-            max_num_seqs=1,  # Process one sequence at a time for stability
-        )
-        print(f"Model loaded successfully with vLLM!")
-        print(f"Using dtype: float16 with XFormers backend (T4 GPU compatible)")
-    @spaces.GPU
-    def infer_text(self, prompt, temperature=0.6, max_tokens=8192, top_p=0.95):
-        """Generate response with vLLM for faster inference"""
-        # Ensure max_tokens doesn't exceed model capacity
-        max_tokens = min(max_tokens, 16384)
-        messages = [
-            {"role": "user", "content": prompt}
-        ]
-        sampling_params = SamplingParams(
-            temperature=temperature,
-            max_tokens=max_tokens,
-            top_p=top_p,
-            top_k=-1,
-        )
-        print(f"Generating with vLLM (temp={temperature}, max_tokens={max_tokens})...")
-        outputs = self.model.chat(messages, sampling_params=sampling_params)
-        response = outputs[0].outputs[0].text
-        return response
 def parse_model_output(text):
-    """Parse model output into structured components"""
-    sections = []
-    # Patterns
-    think_pattern = r'<think>(.*?)</think>'
-    code_pattern = r'```(\w+)?\n(.*?)```'
-    # Extract thinking sections
-    think_matches = list(re.finditer(think_pattern, text, re.DOTALL))
-    last_pos = 0
-    for match in think_matches:
-        # Process text before thinking section
-        before_text = text[last_pos:match.start()].strip()
-        if before_text:
-            sections.extend(parse_text_with_code(before_text))
-        # Add thinking section
-        think_content = match.group(1).strip()
-        sections.append({
-            'type': 'thinking',
-            'content': think_content
-        })
-        last_pos = match.end()
-    # Process remaining text
-    remaining = text[last_pos:].strip()
-    if remaining:
-        sections.extend(parse_text_with_code(remaining))
-    return sections
-def parse_text_with_code(text):
-    """Helper function to parse text containing code blocks"""
-    sections = []
     code_pattern = r'```(\w+)?\n(.*?)```'
-    code_blocks = list(re.finditer(code_pattern, text, re.DOTALL))
-    if not code_blocks:
-        return [{'type': 'text', 'content': text}]
-    text_pos = 0
-    for code_match in code_blocks:
-        # Add text before code
-        pre_code_text = text[text_pos:code_match.start()].strip()
-        if pre_code_text:
-            sections.append({
-                'type': 'text',
-                'content': pre_code_text
-            })
-        # Add code block
-        language = code_match.group(1) or 'python'
-        code_content = code_match.group(2).strip()
-        sections.append({
-            'type': 'code',
-            'language': language,
-            'content': code_content
-        })
-        text_pos = code_match.end()
-    # Add remaining text
-    remaining_text = text[text_pos:].strip()
-    if remaining_text:
-        sections.append({
-            'type': 'text',
-            'content': remaining_text
-        })
-    return sections
-def format_sections_to_html(sections):
     """
-    Convert parsed sections to rich HTML with collapsible elements
-    This approach works reliably with Gradio 5's HTML component
     """
-    html_parts = []
-    # Add JavaScript for interactivity
-    html_parts.append("""
-    <script>
-    function copyCode(elementId) {
-        const codeElement = document.getElementById(elementId);
-        const code = codeElement.textContent;
-        navigator.clipboard.writeText(code).then(() => {
-            // Show temporary success message
-            const btn = event.target;
-            const originalText = btn.textContent;
-            btn.textContent = '✅ Copied!';
-            setTimeout(() => { btn.textContent = originalText; }, 2000);
-        }).catch(err => {
-            console.error('Failed to copy:', err);
-            alert('Failed to copy code');
-        });
-    }
-    function downloadCode(elementId, language) {
-        const codeElement = document.getElementById(elementId);
-        const code = codeElement.textContent;
-        const extensions = {
-            'python': 'py', 'javascript': 'js', 'typescript': 'ts',
-            'html': 'html', 'css': 'css', 'java': 'java',
-            'cpp': 'cpp', 'c': 'c', 'ruby': 'rb',
-            'go': 'go', 'rust': 'rs', 'swift': 'swift',
-            'kotlin': 'kt', 'plaintext': 'txt'
-        };
-        const ext = extensions[language.toLowerCase()] || 'txt';
-        const filename = `code_snippet.${ext}`;
-        const blob = new Blob([code], { type: 'text/plain' });
-        const url = window.URL.createObjectURL(blob);
-        const a = document.createElement('a');
-        a.href = url;
-        a.download = filename;
-        document.body.appendChild(a);
-        a.click();
-        document.body.removeChild(a);
-        window.URL.revokeObjectURL(url);
-    }
-    </script>
-    """)
-    for i, section in enumerate(sections):
-        if section['type'] == 'thinking':
-            # Collapsible thinking section
-            html_parts.append(f"""
-            <details class="thinking-section" style="margin: 15px 0; border: 2px solid #f39c12; border-radius: 8px; background-color: #fff9e6;">
-                <summary style="padding: 12px; cursor: pointer; font-weight: bold; color: #d68910; user-select: none;">
-                    🤔 Thinking Process (Click to expand)
-                </summary>
-                <div style="padding: 15px; border-top: 1px solid #f39c12; background-color: #fffef7; white-space: pre-wrap; font-family: 'Courier New', monospace; font-size: 13px; color: #333; line-height: 1.6; max-height: 500px; overflow-y: auto;">
-{section['content']}
-                </div>
-            </details>
-            """)
-        elif section['type'] == 'code':
-            # Code block with copy/download buttons
-            code_id = f"code-{i}"
-            # Escape HTML in code
-            escaped_code = section['content'].replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
-            html_parts.append(f"""
-            <details class="code-section" open style="margin: 15px 0; border: 2px solid #3498db; border-radius: 8px; background-color: #e8f4fd;">
-                <summary style="padding: 12px; cursor: pointer; font-weight: bold; color: #2874a6; user-select: none;">
-                    💻 Code ({section['language']}) - Click to collapse
-                </summary>
-                <div style="position: relative; padding: 0;">
-                    <div style="position: absolute; top: 10px; right: 10px; z-index: 10;">
-                        <button onclick="copyCode('{code_id}')" style="padding: 6px 12px; margin-right: 5px; background-color: #3498db; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 12px;">
-                            📋 Copy
-                        </button>
-                        <button onclick="downloadCode('{code_id}', '{section['language']}')" style="padding: 6px 12px; background-color: #27ae60; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 12px;">
-                            ⬇️ Download
-                        </button>
-                    </div>
-                    <pre id="{code_id}" style="margin: 0; padding: 40px 15px 15px 15px; background-color: #f8f9fa; border-top: 1px solid #3498db; overflow-x: auto; font-family: 'Courier New', monospace; font-size: 13px; line-height: 1.5;"><code class="language-{section['language']}">{escaped_code}</code></pre>
                 </div>
-            </details>
-            """)
-        else:  # text
-            # Regular text output with markdown-style rendering
-            # Convert markdown to HTML
-            text_html = section['content']
-            # Basic markdown conversions
-            text_html = re.sub(r'\*\*(.*?)\*\*', r'<strong>\1</strong>', text_html)
-            text_html = re.sub(r'\*(.*?)\*', r'<em>\1</em>', text_html)
-            text_html = re.sub(r'`(.*?)`', r'<code style="background-color: #f4f4f4; padding: 2px 5px; border-radius: 3px;">\1</code>', text_html)
-            html_parts.append(f"""
-            <div class="text-section" style="margin: 15px 0; padding: 15px; border: 1px solid #bdc3c7; border-radius: 8px; background-color: #ffffff; white-space: pre-wrap; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif; font-size: 14px; line-height: 1.8; color: #2c3e50;">
-{text_html}
             </div>
-            """)
-    return "\n".join(html_parts)
 # Initialize model
-print("Initializing VibeThinker-1.5B with vLLM...")
-model = VibeThinkerVLLM()
-def generate_response(prompt, temperature, max_tokens, top_p):
-    """Generate and return formatted HTML response"""
     if not prompt.strip():
-        return "<div style='color: #e74c3c; padding: 20px; text-align: center;'>⚠️ Please enter a question.</div>"
-    try:
-        # Show generating message
-        yield "<div style='text-align: center; padding: 40px; color: #3498db;'><h3>🤖 Generating response...</h3><p>This may take a moment...</p></div>"
-        # Generate raw response
-        raw_response = model.infer_text(
-            prompt=prompt,
-            temperature=temperature,
-            max_tokens=max_tokens,
-            top_p=top_p
-        )
-        # Parse the response
-        sections = parse_model_output(raw_response)
-        # Convert to HTML
-        html_output = format_sections_to_html(sections)
-        yield html_output
-    except Exception as e:
-        error_html = f"""
-        <div style='color: #e74c3c; padding: 20px; border: 2px solid #e74c3c; border-radius: 8px; background-color: #fadbd8; margin: 15px 0;'>
-            <h3>❌ Error</h3>
-            <p><strong>{str(e)}</strong></p>
-            <p>Please try again or adjust the parameters.</p>
-        </div>
-        """
-        yield error_html
-# Custom theme for Gradio 5
-theme = gr.themes.Soft(
-    primary_hue="blue",
-    secondary_hue="purple",
-    neutral_hue="slate",
-    font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"],
-).set(
-    button_primary_background_fill="*primary_600",
-    button_primary_background_fill_hover="*primary_700",
-    block_label_text_weight="600",
-    block_title_text_weight="700",
-)
-# Gradio 5 UI
 with gr.Blocks(
-    title="VibeThinker-1.5B Advanced",
-    theme=theme,
-    fill_height=False,
 ) as demo:
     gr.Markdown("""
-    # 🧠 VibeThinker-1.5B: Advanced Reasoning Interface
-    **⚡ Powered by vLLM + XFormers** for 10x faster inference on T4 GPU!
-    ### ✨ Features:
-    - 🤔 **Collapsible Thinking Sections** - Explore the model's reasoning process
-    - 💻 **Interactive Code Blocks** - Copy or download code with one click
-    - 📝 **Clean Formatted Output** - Beautiful rendering for all content types
-    **Best for:** Competitive math problems and algorithm coding challenges
-    [GitHub](https://github.com/WeiboAI/VibeThinker) | [Model](https://huggingface.co/WeiboAI/VibeThinker-1.5B) | [Paper](https://huggingface.co/papers/2511.06221)
     """)
     with gr.Row():
         with gr.Column(scale=1):
             prompt_input = gr.Textbox(
-                label="💬 Your Question",
-                placeholder="Ask a math problem or coding challenge (English works best)...",
-                lines=6,
-                max_lines=15
             )
             with gr.Accordion("⚙️ Advanced Settings", open=False):
                 temperature_slider = gr.Slider(
-                    minimum=0.1,
-                    maximum=1.5,
                     value=0.6,
                     step=0.1,
-                    label="🌡️ Temperature",
-                    info="0.6 or 1.0 recommended"
                 )
                 max_tokens_slider = gr.Slider(
-                    minimum=512,
-                    maximum=16384,  # Reduced for T4 stability
                     value=8192,
                     step=512,
-                    label="📏 Max Tokens",
-                    info="Model supports up to 16,384 tokens (T4 optimized)"
                 )
-                top_p_slider = gr.Slider(
-                    minimum=0.1,
-                    maximum=1.0,
-                    value=0.95,
-                    step=0.05,
-                    label="🎯 Top P",
-                    info="Nucleus sampling parameter"
-                )
-            with gr.Row():
-                submit_btn = gr.Button(
-                    "🚀 Generate Solution",
-                    variant="primary",
-                    scale=2
-                )
-                clear_btn = gr.Button(
-                    "🗑️ Clear",
-                    variant="secondary",
-                    scale=1
-                )
-        with gr.Column(scale=1):
-            # Output area using HTML component
-            output_html = gr.HTML(
-                value="""
-                <div style='text-align: center; padding: 60px; color: #7f8c8d;'>
-                    <h3>👋 Ready to solve problems!</h3>
-                    <p>Enter your question and click Generate Solution</p>
-                </div>
-                """
-            )
-    # Example problems
-    gr.Examples(
-        examples=[
-            ["Make me a single page HTML application that takes a color and outputs a color theme", 0.6, 16384, 0.95],
-            ["Solve: Find the number of positive integers n ≤ 1000 such that n^2 + n + 41 is prime.", 0.6, 12288, 0.95],
-            ["Write an efficient Python implementation of the Sieve of Eratosthenes algorithm.", 0.6, 8192, 0.95],
-            ["Prove using mathematical induction that 1 + 2 + 3 + ... + n = n(n+1)/2", 0.6, 8192, 0.95],
-        ],
-        inputs=[prompt_input, temperature_slider, max_tokens_slider, top_p_slider],
-        label="📚 Example Problems",
-        examples_per_page=4
-    )
-    gr.Markdown("""
-    ---
-    ### 📊 Performance Highlights:
-    | Benchmark | VibeThinker-1.5B | DeepSeek R1 (671B) | Advantage |
-    |-----------|------------------|---------------------|-----------|
-    | **AIME24** | **80.3** ✨ | 79.8 | 400× smaller! |
-    | **AIME25** | **74.4** ✨ | 70.0 | 400× smaller! |
-    | **HMMT25** | **50.4** ✨ | 41.7 | 400× smaller! |
-    | **Training Cost** | **$7,800** | $294,000+ | 40× cheaper! |
-    💡 **Powered by Spectrum-to-Signal Principle (SSP)** training framework
-    """)
-    # Event handlers
-    def clear_interface():
-        return "", """
-        <div style='text-align: center; padding: 60px; color: #7f8c8d;'>
-            <h3>👋 Ready to solve problems!</h3>
-            <p>Enter your question and click Generate Solution</p>
-        </div>
-        """
-    submit_btn.click(
-        fn=generate_response,
-        inputs=[prompt_input, temperature_slider, max_tokens_slider, top_p_slider],
-        outputs=output_html,
-        show_progress="full"
     )
     clear_btn.click(
-        fn=clear_interface,
-        inputs=[],
         outputs=[prompt_input, output_html]
     )
-# Launch with Gradio 5 optimizations
 if __name__ == "__main__":
-    demo.queue(
-        max_size=20,
-        default_concurrency_limit=10
-    )
-    demo.launch(
-        show_api=True,
-        show_error=True,
-    )

 import gradio as gr
+import os
 import re
 from vllm import LLM, SamplingParams
+# Force XFormers backend for T4 compatibility
+os.environ['VLLM_ATTENTION_BACKEND'] = 'XFORMERS'
+os.environ['VLLM_USE_TRITON_FLASH_ATTN'] = '0'
 class VibeThinkerVLLM:
+    def __init__(self):
+        self.model = None
+        self.load_model()
+    def load_model(self):
+        """Load VibeThinker model with vLLM (T4-compatible settings)"""
+        try:
+            self.model = LLM(
+                model="WeiboAI/VibeThinker-1.5B",
+                dtype="float16",  # Use float16 instead of bfloat16 for T4
+                gpu_memory_utilization=0.85,
+                max_model_len=16384,  # Reduced from 40960 for T4 stability
+                enforce_eager=True,  # Disable CUDA graphs for T4
+                disable_custom_all_reduce=True,  # Avoid custom kernels
+                enable_prefix_caching=False,  # Disable for stability
+                max_num_seqs=1,  # Process one sequence at a time
+                trust_remote_code=True
+            )
+            print("✅ vLLM model loaded successfully with T4-compatible settings")
+        except Exception as e:
+            print(f"❌ Error loading model: {e}")
+            raise
+    def generate_response(self, prompt, temperature=0.6, max_tokens=8192, max_thinking_tokens=4096):
+        """
+        Generate response with thinking length control
+        Args:
+            prompt: Input prompt
+            temperature: Sampling temperature
+            max_tokens: Total max tokens (thinking + output)
+            max_thinking_tokens: Maximum tokens for reasoning phase
+        """
+        if not self.model:
+            return "Model not loaded!", 0, 0
+        try:
+            # Create sampling params with thinking token limit
+            sampling_params = SamplingParams(
+                temperature=temperature,
+                top_p=0.95,
+                top_k=-1,
+                max_tokens=max_tokens,
+                stop=None  # Let model decide when to stop
+            )
+            # Format prompt for competitive coding style
+            formatted_prompt = f"""You are a competitive programming assistant. Solve the following problem efficiently.
+Problem:
+{prompt}
+Think step by step, but be concise. Limit your reasoning to the most important steps (max {max_thinking_tokens} tokens for thinking). Then provide your solution."""
+            # Generate with vLLM
+            outputs = self.model.generate([formatted_prompt], sampling_params)
+            if outputs and len(outputs) > 0:
+                output = outputs[0]
+                generated_text = output.outputs[0].text
+                # Get token counts
+                prompt_tokens = len(output.prompt_token_ids)
+                completion_tokens = len(output.outputs[0].token_ids)
+                return generated_text, prompt_tokens, completion_tokens
+            else:
+                return "No output generated", 0, 0
+        except Exception as e:
+            return f"Error during generation: {str(e)}", 0, 0
 def parse_model_output(text):
+    """
+    Parse model output to separate thinking and final answer
+    Returns: (thinking_content, answer_content, code_blocks)
+    """
+    # Try to find thinking section (common patterns)
+    thinking_patterns = [
+        r'<think>(.*?)</think>',
+        r'<thinking>(.*?)</thinking>',
+        r'(?:Let me think|Let\'s think|Thinking):(.*?)(?=\n\n[A-Z]|\n\nSolution:|\n\nAnswer:|\Z)',
+    ]
+    thinking_content = ""
+    for pattern in thinking_patterns:
+        match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
+        if match:
+            thinking_content = match.group(1).strip()
+            # Remove thinking section from text
+            text = re.sub(pattern, '', text, flags=re.DOTALL | re.IGNORECASE)
+            break
+    # If no explicit thinking tags, try to detect reasoning section
+    if not thinking_content:
+        lines = text.split('\n')
+        potential_thinking = []
+        for i, line in enumerate(lines):
+            # Stop if we hit solution/code markers
+            if any(marker in line.lower() for marker in ['```', 'solution:', 'answer:', 'final answer', 'boxed{']):
+                break
+            # Collect reasoning-like content
+            if any(word in line.lower() for word in ['step', 'first', 'then', 'next', 'so', 'therefore', 'because']):
+                potential_thinking.append(line)
+        if len(potential_thinking) > 3:  # If substantial reasoning found
+            thinking_content = '\n'.join(potential_thinking)
+            # Remove from main text
+            for line in potential_thinking:
+                text = text.replace(line, '', 1)
+    # Extract code blocks
     code_pattern = r'```(\w+)?\n(.*?)```'
+    code_blocks = re.findall(code_pattern, text, re.DOTALL)
+    # Extract final answer (boxed or explicit)
+    answer_match = re.search(r'\\boxed\{([^}]+)\}', text)
+    if answer_match:
+        answer_content = f"**Final Answer:** {answer_match.group(1)}"
+    else:
+        # Just use remaining text as answer
+        answer_content = text.strip()
+    return thinking_content, answer_content, code_blocks
+def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_tokens):
     """
+    Format output as styled HTML with good contrast and modern design
     """
+    # Calculate total and thinking token estimates
+    total_tokens = prompt_tokens + completion_tokens
+    thinking_tokens_est = len(thinking.split()) * 1.3  # Rough estimate
+    html = f"""
+    <div style="font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; max-width: 100%; margin: 0 auto; background: #ffffff; color: #1a1a1a;">
+        <!-- Token Stats -->
+        <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 12px; margin-bottom: 24px; color: white; box-shadow: 0 4px 6px rgba(0,0,0,0.1);">
+            <h3 style="margin: 0 0 12px 0; font-size: 18px; font-weight: 600;">📊 Generation Stats</h3>
+            <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 12px; font-size: 14px;">
+                <div style="background: rgba(255,255,255,0.2); padding: 12px; border-radius: 8px;">
+                    <div style="opacity: 0.9; font-size: 12px; margin-bottom: 4px;">Prompt Tokens</div>
+                    <div style="font-size: 20px; font-weight: bold;">{prompt_tokens:,}</div>
+                </div>
+                <div style="background: rgba(255,255,255,0.2); padding: 12px; border-radius: 8px;">
+                    <div style="opacity: 0.9; font-size: 12px; margin-bottom: 4px;">Completion Tokens</div>
+                    <div style="font-size: 20px; font-weight: bold;">{completion_tokens:,}</div>
+                </div>
+                <div style="background: rgba(255,255,255,0.2); padding: 12px; border-radius: 8px;">
+                    <div style="opacity: 0.9; font-size: 12px; margin-bottom: 4px;">Est. Thinking Tokens</div>
+                    <div style="font-size: 20px; font-weight: bold;">{int(thinking_tokens_est):,}</div>
+                </div>
+                <div style="background: rgba(255,255,255,0.2); padding: 12px; border-radius: 8px;">
+                    <div style="opacity: 0.9; font-size: 12px; margin-bottom: 4px;">Total Tokens</div>
+                    <div style="font-size: 20px; font-weight: bold;">{total_tokens:,}</div>
+                </div>
+            </div>
+        </div>
+        <!-- Thinking Section (Collapsible) -->
+        {f'''
+        <details style="background: #f8f9fa; border: 2px solid #e9ecef; border-radius: 12px; padding: 20px; margin-bottom: 24px; box-shadow: 0 2px 4px rgba(0,0,0,0.05);">
+            <summary style="cursor: pointer; font-weight: 600; font-size: 16px; color: #495057; user-select: none; display: flex; align-items: center; gap: 8px;">
+                <span style="font-size: 20px;">🧠</span>
+                <span>Reasoning Process ({int(thinking_tokens_est):,} tokens)</span>
+                <span style="margin-left: auto; font-size: 12px; color: #6c757d;">Click to expand/collapse</span>
+            </summary>
+            <div style="margin-top: 16px; padding-top: 16px; border-top: 1px solid #dee2e6; color: #212529; line-height: 1.7; white-space: pre-wrap; font-size: 14px; font-family: 'SF Mono', Monaco, Consolas, monospace; background: #ffffff; padding: 16px; border-radius: 8px;">
+{thinking}
+            </div>
+        </details>
+        ''' if thinking else ''}
+        <!-- Answer Section -->
+        <div style="background: #ffffff; border: 2px solid #28a745; border-radius: 12px; padding: 24px; margin-bottom: 24px; box-shadow: 0 2px 4px rgba(40,167,69,0.1);">
+            <h3 style="margin: 0 0 16px 0; color: #28a745; font-size: 18px; font-weight: 600; display: flex; align-items: center; gap: 8px;">
+                <span style="font-size: 22px;">✅</span> Final Solution
+            </h3>
+            <div style="color: #212529; line-height: 1.8; font-size: 15px; white-space: pre-wrap;">
+{answer}
+            </div>
+        </div>
+        <!-- Code Blocks -->
+        {f'''
+        <div style="margin-top: 24px;">
+            <h3 style="color: #1a1a1a; font-size: 18px; font-weight: 600; margin-bottom: 16px; display: flex; align-items: center; gap: 8px;">
+                <span style="font-size: 22px;">💻</span> Code
+            </h3>
+            {"".join([f'''
+            <div style="margin-bottom: 16px; background: #1e1e1e; border-radius: 12px; overflow: hidden; box-shadow: 0 4px 6px rgba(0,0,0,0.1);">
+                <div style="background: #2d2d2d; padding: 12px 20px; color: #ffffff; font-weight: 600; font-size: 13px; display: flex; justify-content: space-between; align-items: center; border-bottom: 1px solid #3d3d3d;">
+                    <span>{lang if lang else "code"}</span>
+                    <button onclick="navigator.clipboard.writeText(this.parentElement.nextElementSibling.textContent)"
+                            style="background: #4CAF50; color: white; border: none; padding: 6px 14px; border-radius: 6px; cursor: pointer; font-size: 12px; font-weight: 500; transition: background 0.2s;"
+                            onmouseover="this.style.background='#45a049'"
+                            onmouseout="this.style.background='#4CAF50'">
+                        📋 Copy
+                    </button>
                 </div>
+                <pre style="margin: 0; padding: 20px; color: #d4d4d4; overflow-x: auto; font-family: 'SF Mono', Monaco, Consolas, monospace; font-size: 14px; line-height: 1.6;"><code>{code.strip()}</code></pre>
             </div>
+            ''' for lang, code in code_blocks])}
+        </div>
+        ''' if code_blocks else ''}
+    </div>
+    """
+    return html
 # Initialize model
+print("🔄 Initializing VibeThinker with vLLM (T4-optimized)...")
+vibe_model = VibeThinkerVLLM()
+def generate_solution(prompt, temperature=0.6, max_tokens=8192, max_thinking_tokens=4096):
+    """Generate and format solution"""
     if not prompt.strip():
+        return "<p style='color: #dc3545; font-size: 16px; padding: 20px;'>⚠️ Please enter a problem to solve.</p>"
+    # Generate response with thinking token limit
+    response, prompt_tokens, completion_tokens = vibe_model.generate_response(
+        prompt,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        max_thinking_tokens=max_thinking_tokens
+    )
+    # Parse output
+    thinking, answer, code_blocks = parse_model_output(response)
+    # Format as HTML
+    html_output = format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_tokens)
+    return html_output
+# Create Gradio interface
 with gr.Blocks(
+    theme=gr.themes.Soft(
+        primary_hue="indigo",
+        secondary_hue="purple",
+    ),
+    css="""
+    .gradio-container {
+        max-width: 1400px !important;
+    }
+    """
 ) as demo:
     gr.Markdown("""
+    # 🧠 VibeThinker-1.5B Competitive Coding Assistant
+    **Optimized for**: Competitive programming (LeetCode, Codeforces, AtCoder) and algorithm challenges
+    ⚡ **Powered by vLLM** | 🎯 **Best for**: Python algorithmic problems with clear input/output specs
+    ⚠️ **Note**: This model is specialized for competitive programming, not general software development
     """)
     with gr.Row():
         with gr.Column(scale=1):
             prompt_input = gr.Textbox(
+                label="💭 Your Coding Problem",
+                placeholder="Example: Write a Python function to find the longest palindromic substring in a given string. Include test cases.",
+                lines=8
             )
             with gr.Accordion("⚙️ Advanced Settings", open=False):
                 temperature_slider = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
                     value=0.6,
                     step=0.1,
+                    label="🌡️ Temperature (0.6 recommended)"
                 )
                 max_tokens_slider = gr.Slider(
+                    minimum=1024,
+                    maximum=16384,
                     value=8192,
+                    step=1024,
+                    label="📝 Max Total Tokens"
+                )
+                max_thinking_slider = gr.Slider(
+                    minimum=512,
+                    maximum=8192,
+                    value=3072,
                     step=512,
+                    label="🧠 Max Thinking Tokens (Lower = faster, less verbose)"
                 )
+                gr.Markdown("""
+                **Tips:**
+                - Lower thinking tokens (1024-2048) for faster, more direct solutions
+                - Higher thinking tokens (4096-8192) for complex problems requiring detailed reasoning
+                - Temperature 0.6 balances creativity and accuracy
+                """)
+            generate_btn = gr.Button("🚀 Generate Solution", variant="primary", size="lg")
+            clear_btn = gr.Button("🗑️ Clear", size="sm")
+        with gr.Column(scale=2):
+            output_html = gr.HTML(label="Solution")
+    # Button actions
+    generate_btn.click(
+        fn=generate_solution,
+        inputs=[prompt_input, temperature_slider, max_tokens_slider, max_thinking_slider],
+        outputs=output_html
     )
     clear_btn.click(
+        fn=lambda: ("", ""),
+        inputs=None,
         outputs=[prompt_input, output_html]
     )
+    # Example problems
+    gr.Examples(
+        examples=[
+            ["Write a function to find the maximum sum of a contiguous subarray (Kadane's Algorithm). Include edge cases and test with array [-2,1,-3,4,-1,2,1,-5,4]"],
+            ["Implement a function to detect if a linked list has a cycle. Explain your approach and provide the solution."],
+            ["Given an array of integers and a target sum, find two numbers that add up to the target. Optimize for time complexity."],
+        ],
+        inputs=prompt_input
+    )
 if __name__ == "__main__":
+    demo.launch()