import gradio as gr
import subprocess
import os

# Switch into the BitNet checkout when it is present so that the relative paths
# used in this file (model file, inference script) resolve; otherwise the
# process keeps its current working directory.
_BITNET_ROOT = "/content/BitNet"
if os.path.exists(_BITNET_ROOT):
    os.chdir(_BITNET_ROOT)

# ==============================================================================
# CONSTANTS & CONFIGURATION
# ==============================================================================
# Quantized GGUF weights, relative to the working directory.
MODEL_PATH = "models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf"

# Default persona. The trailing "%^%^%^" asks the model to emit a sentinel that
# the streaming code treats as a stop marker.
DEFAULT_SYSTEM_PROMPT = (
    "You are a Socratic assistant. Do not answer questions directly. "
    + "Instead, respond exclusively with 3 deep, reflective questions. "
    + "Then generate %^%^%^"
)

# ==============================================================================
# STREAMING ENGINE WITH LOOKAHEAD BUFFER
# ==============================================================================
def _shutdown_process(process, grace_seconds=5):
    """Stop *process* if it is still running, release its pipe, and reap it.

    A terminate request is sent first; if the child has not exited after
    *grace_seconds* it is killed, so no zombie or runaway engine is left behind.
    """
    if process.poll() is None:
        process.terminate()
    if process.stdout is not None:
        process.stdout.close()
    try:
        process.wait(timeout=grace_seconds)
    except subprocess.TimeoutExpired:
        process.kill()
        process.wait()


def streaming_chat(user_query, system_prompt):
    """Stream a cleaned model reply for *user_query* under *system_prompt*.

    Runs ``run_inference.py`` as a subprocess, discards the echoed prompt,
    and yields the growing reply while holding back the last
    ``LOOKAHEAD_SIZE`` characters so a stop marker is never shown half-typed.
    Generation is cut at the first stop marker.

    Args:
        user_query: The user's question. ``None``, empty, or whitespace-only
            input produces a single validation message.
        system_prompt: Instruction text placed in the ``System:`` slot of the
            prompt.

    Yields:
        str: Progressively longer, stripped snapshots of the reply; the last
        value is the complete cleaned reply.
    """
    if not user_query or not user_query.strip():
        yield "Please enter a valid question."
        return

    # Dynamically inject the user's custom system instruction
    formatted_chat_prompt = f"System: {system_prompt}\nUser: {user_query}\nAssistant:"

    cmd = [
        "python3", "run_inference.py",
        "-m", MODEL_PATH,
        "-p", formatted_chat_prompt,
        "-n", "120",
        "-temp", "0.4",
        "-t", "2",  # Optimized for Hugging Face free-tier dual-core CPUs
    ]

    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,  # Hide system logs
        text=True,
        # Decode explicitly and tolerate a multi-byte character that is cut
        # off by the token limit or by termination, instead of raising.
        encoding="utf-8",
        errors="replace",
        bufsize=1,
    )

    LOOKAHEAD_SIZE = 45
    prompt_anchor = "Assistant:"

    # These are the markers used to slice the text
    stop_markers = (
        "%^%^%^",
        "[end of text]",
        "User:",
        "Assistant:",
    )

    # The echoed prompt ends with the anchor, but user-supplied text may
    # contain it too. Skip as many anchors as the prompt we sent contains so
    # the remainder of the echo is never mistaken for model output.
    anchors_to_skip = formatted_chat_prompt.count(prompt_anchor)
    anchors_seen = 0

    accumulator = ""
    prompt_cleared = False

    try:
        while True:
            char = process.stdout.read(1)
            if not char:
                break

            accumulator += char

            # --- SWALLOW THE ECHOED PROMPT ---
            if not prompt_cleared:
                # Reading one character at a time means a newly completed
                # anchor always sits at the very end of the accumulator.
                if accumulator.endswith(prompt_anchor):
                    anchors_seen += 1
                    if anchors_seen >= anchors_to_skip:
                        prompt_cleared = True
                        accumulator = ""
                continue

            # --- Scan for structural boundaries; cut at the earliest one ---
            cut_positions = [
                pos for pos in map(accumulator.find, stop_markers) if pos != -1
            ]
            if cut_positions:
                accumulator = accumulator[:min(cut_positions)]
                break

            # Stream text safely outside the trailing boundary window
            if len(accumulator) > LOOKAHEAD_SIZE:
                yield accumulator[:-LOOKAHEAD_SIZE].strip()
    finally:
        # Runs on EOF, on a stop marker, and when the consumer closes the
        # generator early, so the engine never outlives the request.
        _shutdown_process(process)

    yield accumulator.strip()

# ==============================================================================
# TECHNICAL REPORT MARKDOWN TEXT
# ==============================================================================
# Markdown source for the "Technical Report" tab. It is passed verbatim to
# gr.Markdown, so everything between the triple quotes is user-facing text;
# the trailing double space after "Matrix Processing" is intentional markdown.
TECHNICAL_REPORT_MD = """
## 📋 Technical Report: 1-Bit LLM Socratic Refinement Pipeline
**Architecture Core:** Ternary Quantized (1.58-bit) Matrix Processing  

---

### 1. Executive Objective & Target Dataset
The goal of this initiative was to engineer a hyper-lightweight, lightning-fast edge computing application capable of engaging users in conversational Socratic exploration. Traditional full-precision models require significant memory overhead to hold nuanced philosophical frameworks. This project focused on building an ultra-compressed conversational experience capable of executing inside a constrained local CPU footprint (e.g., standard consumer laptops or free cloud application tiers).

* **Target Fine-Tuning Dataset:** `sanjaypantdsd/socratic-method-conversations`
* **Data Characteristics:** High-quality, clean input-to-output mappings that translate explicit factual questions or structural concepts directly into clusters of exactly three open-ended, deeply analytical questions.

---

### 2. Model Training Matrix & Evaluation Phase
Our initial strategy focused on fine-tuning custom models directly on our targeted Socratic dataset. The results exposed clear engineering trade-offs:

| Model Identifier | Architecture Configuration | Operational Performance | Qualitative Evaluation |
| :--- | :--- | :--- | :--- |
| **st192011/bitnet-socratic-1.58b** | Full-precision parameter adjustments tailored to target dataset. | **Excellent** | Produced highly coherent Socratic question arrays aligning perfectly with training structures. |
| **st192011/socratic-bitnet-2b** | Quantized Ternary Representation Variant of custom weights. | **Critically Poor** | Suffered extreme degradation. The model experienced severe structural collapse, outputting infinite semantic loops or unreadable token gibberish. |

#### Analysis of Quantization Collapse
The stark failure of `st192011/socratic-bitnet-2b` highlights a common hurdle in customized 1-bit AI development. When a model's weights are aggressively compressed down to simple ternary values (-1, 0, 1), the mathematical boundaries become extremely rigid. Standard quantization tools often distort the delicate behavioral traits introduced during fine-tuning.

---

### 3. Strategy Pivot: Pretrained Weights + Structural Prompt Anchoring
To avoid the quantization bugs of custom fine-tuned weights, we pivoted to a hybrid solution: **combining the official pretrained base weights from Microsoft with precision prompt engineering.**

We deployed `microsoft/bitnet-b1.58-2B-4T-gguf`. While this preserved its foundational knowledge base, it introduced a new challenge: **Base models do not natively know when to stop generating.**

#### The Stop-Token Anchor Hack
To enforce structure, we modified the System Prompt to force the model to declare its own stopping point:
> *"You are a Socratic assistant... Respond exclusively with 3 deep, reflective questions. Then generate %^%^%^"*

This instruction forces the text-prediction engine to anchor itself on a predictable phrase. While the model still experiences trailing hallucinations, it prints a recognizable marker *immediately after* providing the high-quality questions.

---

### 4. Production Pipeline Architecture
To deliver a flawless UX, we implemented a **Programmatic UX Stream Filter**:

* **The Lookahead Buffer Zone:** The streaming engine retains the trailing 45 characters inside a private memory array, evaluating it for known stop-sequences before releasing clean text to the UI.
* **Process Resource Reclamation:** The moment a marker is tripped, a background system command kills the active process (`process.terminate()`).
"""

# ==============================================================================
# GRADIO INTERFACE LAYOUT (TABBED WINDOWS)
# ==============================================================================
# Build the two-tab UI. Components are attached to whichever layout context
# (Blocks / Tabs / TabItem / Row / Column) is open when they are created, so
# the order of statements below determines the on-screen arrangement.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 High-Performance 1-Bit AI Sandbox")
    
    with gr.Tabs():
        # --- TAB 1: INTERACTIVE APP ---
        with gr.TabItem("Experimental Interface"):
            gr.Markdown("### Real-Time 1.58-bit Prompting Sandbox")
            gr.Markdown("Test the limits of Microsoft's BitNet GGUF kernel. Change the persona, modify the rules, and see how the Python cleaning function reacts in real-time.")
            
            # Top row: explanation of the stop-marker trick (narrow column)
            # next to the editable system prompt (wide column).
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 🛠️ The \"Stop Token\" Hack")
                    gr.Markdown(
                        "**Base models don't know how to stop talking!**\n\n"
                        "To prevent infinite loops, our system prompt instructs the model to literally type the words `%^%^%^` when it is finished. "
                        "Our Python backend uses a **Lookahead Buffer** to watch for those words. If it sees them, it instantly slices them out and kills the engine.\n\n"
                        "*🧪 Try deleting the words `'Then generate %^%^%^'` from the prompt below and see what happens!*"
                    )
                
                with gr.Column(scale=2):
                    # Pre-filled with DEFAULT_SYSTEM_PROMPT; whatever the user
                    # leaves here is passed to streaming_chat as system_prompt.
                    system_prompt_input = gr.Textbox(
                        label="System Instruction (Editable)", 
                        value=DEFAULT_SYSTEM_PROMPT,
                        lines=3
                    )
            
            gr.Markdown("---")
            
            # Bottom row: query input and button on the left, read-only
            # streaming output on the right.
            with gr.Row():
                with gr.Column(scale=4):
                    input_text = gr.Textbox(
                        label="User Query", 
                        placeholder="e.g., What makes something responsibility?",
                        lines=2
                    )
                    submit_btn = gr.Button("Generate Response", variant="primary")
                with gr.Column(scale=5):
                    output_text = gr.Textbox(
                        label="Cleaned Real-Time Streaming Output", 
                        lines=8, 
                        interactive=False
                    )
                    
            # Wire up the inputs to include the system prompt.
            # Both the button click and the textbox submit event call the same
            # generator; the input order matches streaming_chat's
            # (user_query, system_prompt) parameters.
            submit_btn.click(fn=streaming_chat, inputs=[input_text, system_prompt_input], outputs=output_text)
            input_text.submit(fn=streaming_chat, inputs=[input_text, system_prompt_input], outputs=output_text)

        # --- TAB 2: TECHNICAL REPORT ---
        with gr.TabItem("Technical Report"):
            gr.Markdown(TECHNICAL_REPORT_MD)

# Start the server only when run as a script. "0.0.0.0" listens on all network
# interfaces. NOTE(review): port 7860 is presumably what the hosting platform
# expects — confirm against the deployment config.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)