Spaces:
Running
Running
File size: 9,813 Bytes
6f7a25d 1fc5cde 6f0c92d 6f7a25d dbdd36b 6f7a25d 1fc5cde 6f0c92d 6f7a25d 1fc5cde 6f0c92d 6f7a25d 1fc5cde 6f7a25d 6f0c92d 6f7a25d 1fc5cde 6f0c92d 1fc5cde a6168f7 1fc5cde 6f0c92d 1fc5cde dbdd36b 71165e2 1fc5cde 6f7a25d 1fc5cde a6168f7 1fc5cde 6f0c92d a6168f7 6f0c92d 1fc5cde 6f0c92d 1fc5cde 6f0c92d 1fc5cde 6f0c92d 1fc5cde a6168f7 1fc5cde 6f0c92d 1fc5cde 6f7a25d 1fc5cde 6f0c92d 1fc5cde 6f0c92d 1fc5cde 6f0c92d 1fc5cde 6f0c92d 1fc5cde 6f0c92d dbdd36b 1fc5cde 6f0c92d 1fc5cde 6f0c92d 1fc5cde 6f0c92d 1fc5cde 6f7a25d 6f0c92d 6f7a25d 1fc5cde 6f0c92d dbdd36b 6f0c92d dbdd36b 6f0c92d 6f7a25d 1fc5cde 6f0c92d 1fc5cde 6f0c92d 1fc5cde 6f0c92d 1fc5cde 6f7a25d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 | import gradio as gr
import subprocess
import os
# Ensure execution context is inside the compiled architecture directory
# The inference script and the model file are addressed with relative paths,
# so switch into the BitNet checkout when it is present. isdir() (rather than
# exists()) keeps a stray regular file at this path from crashing the import
# with NotADirectoryError.
_BITNET_DIR = "/content/BitNet"
if os.path.isdir(_BITNET_DIR):
    os.chdir(_BITNET_DIR)
# ==============================================================================
# CONSTANTS & CONFIGURATION
# ==============================================================================
# Relative path of the GGUF weights handed to the inference script via "-m".
MODEL_PATH = "models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf"

# Default system instruction shown in the editable textbox. The last sentence
# asks the model to emit the "%^%^%^" sentinel that the streaming filter
# watches for.
DEFAULT_SYSTEM_PROMPT = " ".join(
    (
        "You are a Socratic assistant.",
        "Do not answer questions directly.",
        "Instead, respond exclusively with 3 deep, reflective questions.",
        "Then generate %^%^%^",
    )
)
# ==============================================================================
# STREAMING ENGINE WITH LOOKAHEAD BUFFER
# ==============================================================================
def _shutdown_inference(process, grace_seconds=2.0):
    """Stop *process* if it is still running, reap it, and close its stdout pipe.

    Args:
        process: The ``subprocess.Popen`` handle created by ``streaming_chat``.
        grace_seconds: How long to wait after ``terminate()`` before
            escalating to ``kill()``.
    """
    try:
        if process.poll() is None:
            process.terminate()
            try:
                process.wait(timeout=grace_seconds)
            except subprocess.TimeoutExpired:
                # terminate() was ignored or is slow; escalate so the
                # inference process is never left running.
                process.kill()
                process.wait()
    finally:
        # Closing our read end releases the file descriptor.
        # NOTE(review): presumably this also makes any descendant that still
        # holds the write end fail on its next write - confirm against
        # run_inference.py.
        if process.stdout is not None:
            process.stdout.close()


def streaming_chat(user_query, system_prompt):
    """Stream a cleaned model reply for *user_query* as a growing string.

    Launches ``run_inference.py`` as a subprocess, reads its stdout one
    character at a time, discards the echoed prompt, and yields the reply
    accumulated so far. The newest characters are held back in a lookahead
    window so that a stop marker can be cut out before any part of it is
    shown. Generation ends when a stop marker appears or the process closes
    its stdout.

    Args:
        user_query: The user's question. ``None``, empty, or whitespace-only
            input produces a single validation message instead of a model run.
        system_prompt: The system instruction placed before the question.
            ``None`` is treated as an empty instruction.

    Yields:
        str: The stripped reply text released so far; the final item is the
        complete cleaned reply.
    """
    if not user_query or not user_query.strip():
        yield "Please enter a valid question."
        return

    # Guard against None so the prompt never contains the literal text "None".
    system_prompt = system_prompt or ""

    # Dynamically inject the user's custom system instruction
    formatted_chat_prompt = f"System: {system_prompt}\nUser: {user_query}\nAssistant:"
    cmd = [
        "python3", "run_inference.py",
        "-m", MODEL_PATH,
        "-p", formatted_chat_prompt,
        "-n", "120",
        "-temp", "0.4",
        "-t", "2",  # Optimized for Hugging Face free-tier dual-core CPUs
    ]

    assistant_tag = "Assistant:"
    # The editable system prompt or the query may themselves contain the tag.
    # Wait until every occurrence in the prompt has been echoed so the echo is
    # swallowed completely and its own tag is not mistaken for a stop marker.
    echoed_tags_expected = formatted_chat_prompt.count(assistant_tag)

    lookahead_size = 45
    # These are the markers used to slice the text and end generation
    stop_markers = ("%^%^%^", "[end of text]", "User:", assistant_tag)

    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,  # Hide system logs
        text=True,
        bufsize=1,
    )

    accumulator = ""
    prompt_cleared = False
    try:
        while True:
            char = process.stdout.read(1)
            if not char:
                break
            accumulator += char

            # --- SWALLOW THE ECHOED PROMPT ---
            if not prompt_cleared:
                if accumulator.count(assistant_tag) >= echoed_tags_expected:
                    prompt_cleared = True
                    accumulator = accumulator.rpartition(assistant_tag)[2].lstrip()
                continue

            # --- Scan for structural boundaries ---
            marker_hit = next((m for m in stop_markers if m in accumulator), None)
            if marker_hit is not None:
                # Keep only the text before the marker; the finally clause
                # below stops the engine.
                accumulator = accumulator.split(marker_hit)[0]
                break

            # Stream text safely outside the trailing boundary window
            if len(accumulator) > lookahead_size:
                yield accumulator[:-lookahead_size].strip()
    finally:
        # Runs on normal completion, on a stop marker, on errors, and when the
        # consumer closes the generator early, so the subprocess never
        # outlives the request.
        _shutdown_inference(process)

    yield accumulator.strip()
# ==============================================================================
# TECHNICAL REPORT MARKDOWN TEXT
# ==============================================================================
# Markdown source of the project write-up. It is passed unchanged to
# gr.Markdown in the "Technical Report" tab, so everything between the triple
# quotes is user-visible text.
TECHNICAL_REPORT_MD = """
## 📋 Technical Report: 1-Bit LLM Socratic Refinement Pipeline
**Architecture Core:** Ternary Quantized (1.58-bit) Matrix Processing
---
### 1. Executive Objective & Target Dataset
The goal of this initiative was to engineer a hyper-lightweight, lightning-fast edge computing application capable of engaging users in conversational Socratic exploration. Traditional full-precision models require significant memory overhead to hold nuanced philosophical frameworks. This project focused on building an ultra-compressed conversational experience capable of executing inside a constrained local CPU footprint (e.g., standard consumer laptops or free cloud application tiers).
* **Target Fine-Tuning Dataset:** `sanjaypantdsd/socratic-method-conversations`
* **Data Characteristics:** High-quality, clean input-to-output mappings that translate explicit factual questions or structural concepts directly into clusters of exactly three open-ended, deeply analytical questions.
---
### 2. Model Training Matrix & Evaluation Phase
Our initial strategy focused on fine-tuning custom models directly on our targeted Socratic dataset. The results exposed clear engineering trade-offs:
| Model Identifier | Architecture Configuration | Operational Performance | Qualitative Evaluation |
| :--- | :--- | :--- | :--- |
| **st192011/bitnet-socratic-1.58b** | Full-precision parameter adjustments tailored to target dataset. | **Excellent** | Produced highly coherent Socratic question arrays aligning perfectly with training structures. |
| **st192011/socratic-bitnet-2b** | Quantized Ternary Representation Variant of custom weights. | **Critically Poor** | Suffered extreme degradation. The model experienced severe structural collapse, outputting infinite semantic loops or unreadable token gibberish. |
#### Analysis of Quantization Collapse
The stark failure of `st192011/socratic-bitnet-2b` highlights a common hurdle in customized 1-bit AI development. When a model's weights are aggressively compressed down to simple ternary values (-1, 0, 1), the mathematical boundaries become extremely rigid. Standard quantization tools often distort the delicate behavioral traits introduced during fine-tuning.
---
### 3. Strategy Pivot: Pretrained Weights + Structural Prompt Anchoring
To avoid the quantization bugs of custom fine-tuned weights, we pivoted to a hybrid solution: **combining the official pretrained base weights from Microsoft with precision prompt engineering.**
We deployed `microsoft/bitnet-b1.58-2B-4T-gguf`. While this preserved its foundational knowledge base, it introduced a new challenge: **Base models do not natively know when to stop generating.**
#### The Stop-Token Anchor Hack
To enforce structure, we modified the System Prompt to force the model to declare its own stopping point:
> *"You are a Socratic assistant... Respond exclusively with 3 deep, reflective questions. Then generate %^%^%^"*
This instruction forces the text-prediction engine to anchor itself on a predictable phrase. While the model still experiences trailing hallucinations, it prints a recognizable marker *immediately after* providing the high-quality questions.
---
### 4. Production Pipeline Architecture
To deliver a flawless UX, we implemented a **Programmatic UX Stream Filter**:
* **The Lookahead Buffer Zone:** The streaming engine retains the trailing 45 characters inside a private memory array, evaluating it for known stop-sequences before releasing clean text to the UI.
* **Process Resource Reclamation:** The moment a marker is tripped, a background system command kills the active process (`process.terminate()`).
"""
# ==============================================================================
# GRADIO INTERFACE LAYOUT (TABBED WINDOWS)
# ==============================================================================
# Builds the two-tab UI and binds it to the module-level name `demo`, which
# the entry point launches.
# NOTE(review): indentation was not visible in the reviewed copy; the nesting
# below is reconstructed from statement order - confirm it matches the
# deployed layout.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 High-Performance 1-Bit AI Sandbox")
    with gr.Tabs():
        # --- TAB 1: INTERACTIVE APP ---
        with gr.TabItem("Experimental Interface"):
            gr.Markdown("### Real-Time 1.58-bit Prompting Sandbox")
            gr.Markdown("Test the limits of Microsoft's BitNet GGUF kernel. Change the persona, modify the rules, and see how the Python cleaning function reacts in real-time.")
            # Top row: explanation of the stop-marker trick beside the
            # editable system instruction.
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 🛠️ The \"Stop Token\" Hack")
                    gr.Markdown(
                        "**Base models don't know how to stop talking!**\n\n"
                        "To prevent infinite loops, our system prompt instructs the model to literally type the words `%^%^%^` when it is finished. "
                        "Our Python backend uses a **Lookahead Buffer** to watch for those words. If it sees them, it instantly slices them out and kills the engine.\n\n"
                        "*🧪 Try deleting the words `'Then generate %^%^%^'` from the prompt below and see what happens!*"
                    )
                with gr.Column(scale=2):
                    # Pre-filled with DEFAULT_SYSTEM_PROMPT; its current value
                    # is passed to streaming_chat as `system_prompt`.
                    system_prompt_input = gr.Textbox(
                        label="System Instruction (Editable)",
                        value=DEFAULT_SYSTEM_PROMPT,
                        lines=3
                    )
            gr.Markdown("---")
            # Bottom row: query input and button beside the read-only output.
            with gr.Row():
                with gr.Column(scale=4):
                    input_text = gr.Textbox(
                        label="User Query",
                        placeholder="e.g., What makes something responsibility?",
                        lines=2
                    )
                    submit_btn = gr.Button("Generate Response", variant="primary")
                with gr.Column(scale=5):
                    # Receives each string yielded by streaming_chat.
                    output_text = gr.Textbox(
                        label="Cleaned Real-Time Streaming Output",
                        lines=8,
                        interactive=False
                    )
            # Wire up the inputs to include the system prompt. The button
            # click and the textbox submit event call the same handler with
            # the same arguments.
            submit_btn.click(fn=streaming_chat, inputs=[input_text, system_prompt_input], outputs=output_text)
            input_text.submit(fn=streaming_chat, inputs=[input_text, system_prompt_input], outputs=output_text)
        # --- TAB 2: TECHNICAL REPORT ---
        with gr.TabItem("Technical Report"):
            gr.Markdown(TECHNICAL_REPORT_MD)
# Script entry point: start the Gradio server only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    # Listen on every network interface (0.0.0.0) at port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)