File size: 9,813 Bytes
6f7a25d
 
 
 
1fc5cde
 
 
 
 
 
 
6f0c92d
 
 
6f7a25d
 
dbdd36b
6f7a25d
 
1fc5cde
 
 
6f0c92d
6f7a25d
1fc5cde
 
 
6f0c92d
 
6f7a25d
 
 
 
 
1fc5cde
6f7a25d
6f0c92d
6f7a25d
 
1fc5cde
 
 
6f0c92d
1fc5cde
 
 
 
 
a6168f7
1fc5cde
 
6f0c92d
1fc5cde
dbdd36b
71165e2
 
1fc5cde
 
6f7a25d
1fc5cde
 
 
a6168f7
1fc5cde
 
 
6f0c92d
a6168f7
 
 
 
6f0c92d
1fc5cde
6f0c92d
1fc5cde
 
 
6f0c92d
1fc5cde
 
 
 
 
6f0c92d
1fc5cde
 
a6168f7
1fc5cde
 
 
 
 
 
 
 
 
 
6f0c92d
1fc5cde
 
 
6f7a25d
1fc5cde
 
 
 
 
 
 
 
 
6f0c92d
1fc5cde
 
 
 
 
 
 
6f0c92d
1fc5cde
 
 
 
6f0c92d
1fc5cde
6f0c92d
1fc5cde
 
6f0c92d
dbdd36b
1fc5cde
6f0c92d
1fc5cde
 
 
 
6f0c92d
1fc5cde
6f0c92d
 
1fc5cde
 
 
 
 
6f7a25d
6f0c92d
6f7a25d
1fc5cde
 
6f0c92d
 
 
 
 
 
 
 
 
dbdd36b
6f0c92d
dbdd36b
6f0c92d
 
 
 
 
 
 
 
 
 
6f7a25d
1fc5cde
 
 
6f0c92d
 
1fc5cde
 
6f0c92d
1fc5cde
 
 
 
 
 
 
6f0c92d
 
 
1fc5cde
 
 
 
6f7a25d
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
import gradio as gr
import subprocess
import os

# Switch into the BitNet checkout when it is present, so the relative paths
# used later in this file (run_inference.py, MODEL_PATH) resolve against it.
# When the directory is absent the current working directory is left as-is.
_BITNET_ROOT = "/content/BitNet"
if os.path.exists(_BITNET_ROOT):
    os.chdir(_BITNET_ROOT)

# ==============================================================================
# CONSTANTS & CONFIGURATION
# ==============================================================================
# Relative path of the GGUF weights file passed to run_inference.py via -m.
MODEL_PATH = "models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf"

# Initial contents of the editable system-instruction box. The final sentence
# asks the model to print the %^%^%^ sentinel that the stream filter cuts on.
DEFAULT_SYSTEM_PROMPT = " ".join(
    [
        "You are a Socratic assistant.",
        "Do not answer questions directly.",
        "Instead, respond exclusively with 3 deep, reflective questions.",
        "Then generate %^%^%^",
    ]
)

# ==============================================================================
# STREAMING ENGINE WITH LOOKAHEAD BUFFER
# ==============================================================================
def streaming_chat(user_query, system_prompt):
    """Stream a cleaned model reply for *user_query* as a generator.

    Launches ``run_inference.py`` as a subprocess, discards the prompt that
    the engine echoes on stdout, and yields the progressively growing reply.
    While streaming, the trailing ``lookahead_size`` characters are withheld
    so a stop marker can be cut out before any part of it reaches the UI.

    Args:
        user_query: The question typed by the user.
        system_prompt: Instruction text placed in the ``System:`` turn.

    Yields:
        The whitespace-stripped reply accumulated so far. The last value is
        the complete reply, truncated at the first stop marker. A blank or
        missing query yields a single validation message instead.
    """
    # Guard against None as well as blank text so the handler never crashes
    # on `.strip()` before it can report the problem to the user.
    if not user_query or not user_query.strip():
        yield "Please enter a valid question."
        return

    # Dynamically inject the user's custom system instruction
    formatted_chat_prompt = f"System: {system_prompt}\nUser: {user_query}\nAssistant:"

    cmd = [
        "python3", "run_inference.py",
        "-m", MODEL_PATH,
        "-p", formatted_chat_prompt,
        "-n", "120",
        "-temp", "0.4",
        "-t", "2",  # Optimized for Hugging Face free-tier dual-core CPUs
    ]

    reply_anchor = "Assistant:"
    # The echoed prompt ends with the anchor, but user-supplied text can
    # contain it too. Skip as many anchors as the prompt itself holds so the
    # reply starts after the *last* one rather than the first.
    anchors_to_skip = formatted_chat_prompt.count(reply_anchor)

    lookahead_size = 45  # must stay longer than the longest stop marker
    # These are the markers used to slice the text and end generation.
    stop_markers = ("%^%^%^", "[end of text]", "User:", "Assistant:")

    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,  # Hide system logs
        text=True,
        bufsize=1,
    )

    accumulator = ""
    prompt_cleared = False

    try:
        while True:
            # Read one character at a time so a marker is detected the
            # moment its last character arrives.
            char = process.stdout.read(1)
            if not char:
                break

            accumulator += char

            # --- SWALLOW THE ECHOED PROMPT ---
            if not prompt_cleared:
                if accumulator.endswith(reply_anchor):
                    anchors_to_skip -= 1
                    accumulator = ""
                    prompt_cleared = anchors_to_skip <= 0
                continue

            # --- Scan for structural boundaries ---
            hit = next((m for m in stop_markers if m in accumulator), None)
            if hit is not None:
                # Keep only the text before the marker; cleanup below stops
                # the engine.
                accumulator = accumulator.split(hit)[0]
                break

            # Stream text safely outside the trailing boundary window
            if len(accumulator) > lookahead_size:
                yield accumulator[:-lookahead_size].strip()
    finally:
        # Runs on EOF, on a stop marker, and when the consumer closes the
        # generator early, so the engine is always stopped and reaped.
        # NOTE(review): terminate() signals only the direct child; if
        # run_inference.py launches the engine as a grandchild process it
        # may outlive this call - confirm.
        if process.poll() is None:
            process.terminate()
        process.stdout.close()
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()

    yield accumulator.strip()

# ==============================================================================
# TECHNICAL REPORT MARKDOWN TEXT
# ==============================================================================
# Markdown source for the "Technical Report" tab. This is display-only text:
# it is handed unchanged to gr.Markdown and is not read by the streaming code.
TECHNICAL_REPORT_MD = """
## 📋 Technical Report: 1-Bit LLM Socratic Refinement Pipeline
**Architecture Core:** Ternary Quantized (1.58-bit) Matrix Processing  

---

### 1. Executive Objective & Target Dataset
The goal of this initiative was to engineer a hyper-lightweight, lightning-fast edge computing application capable of engaging users in conversational Socratic exploration. Traditional full-precision models require significant memory overhead to hold nuanced philosophical frameworks. This project focused on building an ultra-compressed conversational experience capable of executing inside a constrained local CPU footprint (e.g., standard consumer laptops or free cloud application tiers).

* **Target Fine-Tuning Dataset:** `sanjaypantdsd/socratic-method-conversations`
* **Data Characteristics:** High-quality, clean input-to-output mappings that translate explicit factual questions or structural concepts directly into clusters of exactly three open-ended, deeply analytical questions.

---

### 2. Model Training Matrix & Evaluation Phase
Our initial strategy focused on fine-tuning custom models directly on our targeted Socratic dataset. The results exposed clear engineering trade-offs:

| Model Identifier | Architecture Configuration | Operational Performance | Qualitative Evaluation |
| :--- | :--- | :--- | :--- |
| **st192011/bitnet-socratic-1.58b** | Full-precision parameter adjustments tailored to target dataset. | **Excellent** | Produced highly coherent Socratic question arrays aligning perfectly with training structures. |
| **st192011/socratic-bitnet-2b** | Quantized Ternary Representation Variant of custom weights. | **Critically Poor** | Suffered extreme degradation. The model experienced severe structural collapse, outputting infinite semantic loops or unreadable token gibberish. |

#### Analysis of Quantization Collapse
The stark failure of `st192011/socratic-bitnet-2b` highlights a common hurdle in customized 1-bit AI development. When a model's weights are aggressively compressed down to simple ternary values (-1, 0, 1), the mathematical boundaries become extremely rigid. Standard quantization tools often distort the delicate behavioral traits introduced during fine-tuning.

---

### 3. Strategy Pivot: Pretrained Weights + Structural Prompt Anchoring
To avoid the quantization bugs of custom fine-tuned weights, we pivoted to a hybrid solution: **combining the official pretrained base weights from Microsoft with precision prompt engineering.**

We deployed `microsoft/bitnet-b1.58-2B-4T-gguf`. While this preserved its foundational knowledge base, it introduced a new challenge: **Base models do not natively know when to stop generating.**

#### The Stop-Token Anchor Hack
To enforce structure, we modified the System Prompt to force the model to declare its own stopping point:
> *"You are a Socratic assistant... Respond exclusively with 3 deep, reflective questions. Then generate %^%^%^"*

This instruction forces the text-prediction engine to anchor itself on a predictable phrase. While the model still experiences trailing hallucinations, it prints a recognizable marker *immediately after* providing the high-quality questions.

---

### 4. Production Pipeline Architecture
To deliver a flawless UX, we implemented a **Programmatic UX Stream Filter**:

* **The Lookahead Buffer Zone:** The streaming engine retains the trailing 45 characters inside a private memory array, evaluating it for known stop-sequences before releasing clean text to the UI.
* **Process Resource Reclamation:** The moment a marker is tripped, a background system command kills the active process (`process.terminate()`).
"""

# ==============================================================================
# GRADIO INTERFACE LAYOUT (TABBED WINDOWS)
# ==============================================================================
# Explanatory sidebar text shown next to the editable system prompt.
_STOP_TOKEN_BLURB = (
    "**Base models don't know how to stop talking!**\n\n"
    "To prevent infinite loops, our system prompt instructs the model to literally type the words `%^%^%^` when it is finished. "
    "Our Python backend uses a **Lookahead Buffer** to watch for those words. If it sees them, it instantly slices them out and kills the engine.\n\n"
    "*🧪 Try deleting the words `'Then generate %^%^%^'` from the prompt below and see what happens!*"
)

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 High-Performance 1-Bit AI Sandbox")

    with gr.Tabs():
        # Tab 1: the interactive playground.
        with gr.TabItem("Experimental Interface"):
            gr.Markdown("### Real-Time 1.58-bit Prompting Sandbox")
            gr.Markdown(
                "Test the limits of Microsoft's BitNet GGUF kernel. Change the persona, modify the rules, and see how the Python cleaning function reacts in real-time."
            )

            # Top row: explanation on the left, editable system prompt on the right.
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown('### 🛠️ The "Stop Token" Hack')
                    gr.Markdown(_STOP_TOKEN_BLURB)

                with gr.Column(scale=2):
                    system_prompt_input = gr.Textbox(
                        value=DEFAULT_SYSTEM_PROMPT,
                        label="System Instruction (Editable)",
                        lines=3,
                    )

            gr.Markdown("---")

            # Bottom row: query entry on the left, streamed answer on the right.
            with gr.Row():
                with gr.Column(scale=4):
                    input_text = gr.Textbox(
                        lines=2,
                        label="User Query",
                        placeholder="e.g., What makes something responsibility?",
                    )
                    submit_btn = gr.Button("Generate Response", variant="primary")
                with gr.Column(scale=5):
                    output_text = gr.Textbox(
                        interactive=False,
                        lines=8,
                        label="Cleaned Real-Time Streaming Output",
                    )

            # The button's click event and the query box's submit event both
            # run the same streaming handler with the query and system prompt.
            for _register in (submit_btn.click, input_text.submit):
                _register(
                    fn=streaming_chat,
                    inputs=[input_text, system_prompt_input],
                    outputs=output_text,
                )

        # Tab 2: the static technical write-up.
        with gr.TabItem("Technical Report"):
            gr.Markdown(TECHNICAL_REPORT_MD)

if __name__ == "__main__":
    # Only start the web server when run as a script; listen on every
    # network interface (0.0.0.0) at port 7860.
    demo.launch(
        server_port=7860,
        server_name="0.0.0.0",
    )