# Hugging Face Space source by st192011 -- commit 71165e2 ("Update app.py", verified).
import gradio as gr
import subprocess
import os
# Run from the BitNet checkout when it is present, so that the relative paths
# used by this script (run_inference.py, MODEL_PATH) resolve against it.
# isdir() rather than exists(): a regular file at that path would make
# os.chdir() raise NotADirectoryError.
if os.path.isdir("/content/BitNet"):
    os.chdir("/content/BitNet")
# ==============================================================================
# CONSTANTS & CONFIGURATION
# ==============================================================================
# Path to the GGUF model file, passed to run_inference.py via "-m".
MODEL_PATH = "models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf"

# Initial value of the editable system-instruction textbox. The trailing
# "%^%^%^" sentinel is one of the stop markers streaming_chat() watches for.
DEFAULT_SYSTEM_PROMPT = " ".join(
    (
        "You are a Socratic assistant.",
        "Do not answer questions directly.",
        "Instead, respond exclusively with 3 deep, reflective questions.",
        "Then generate %^%^%^",
    )
)
# ==============================================================================
# STREAMING ENGINE WITH LOOKAHEAD BUFFER
# ==============================================================================
def streaming_chat(user_query, system_prompt):
    """Stream a cleaned model reply for ``user_query``.

    Launches ``run_inference.py`` as a subprocess, reads its stdout one
    character at a time, skips the echoed prompt, and yields the growing
    reply while holding back the trailing ``lookahead_size`` characters so
    a partially printed stop marker is never shown.

    Args:
        user_query: Text typed by the user. ``None``, empty, or
            whitespace-only input yields a validation message instead of
            starting the subprocess.
        system_prompt: Instruction text placed on the ``System:`` line of
            the prompt.

    Yields:
        The stripped reply accumulated so far (minus the lookahead window),
        followed by one final yield of the complete stripped reply, cut at
        the first stop marker if one was seen.
    """
    if not user_query or not user_query.strip():
        yield "Please enter a valid question."
        return

    assistant_tag = "Assistant:"
    lookahead_size = 45
    # Any of these appearing in the reply ends generation; the text is cut
    # just before the marker.
    stop_markers = ("%^%^%^", "[end of text]", "User:", assistant_tag)

    # Dynamically inject the user's custom system instruction.
    formatted_chat_prompt = f"System: {system_prompt}\nUser: {user_query}\nAssistant:"

    # The prompt is echoed on stdout before the reply. Count how many times
    # the tag occurs in the prompt itself, so user text containing
    # "Assistant:" cannot end the echo-skipping phase early.
    echo_tags_remaining = formatted_chat_prompt.count(assistant_tag)

    cmd = [
        "python3", "run_inference.py",
        "-m", MODEL_PATH,
        "-p", formatted_chat_prompt,
        "-n", "120",
        "-temp", "0.4",
        "-t", "2",  # Optimized for Hugging Face free-tier dual-core CPUs
    ]
    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,  # Hide system logs
        text=True,
        bufsize=1,
    )

    accumulator = ""
    try:
        while True:
            char = process.stdout.read(1)
            if not char:
                break
            accumulator += char

            # --- Swallow the echoed prompt ---
            if echo_tags_remaining:
                if accumulator.endswith(assistant_tag):
                    echo_tags_remaining -= 1
                    if not echo_tags_remaining:
                        # Everything up to and including the final tag was
                        # prompt echo; start collecting the reply afresh.
                        accumulator = ""
                continue

            # --- Scan for structural boundaries ---
            hit = next((m for m in stop_markers if m in accumulator), None)
            if hit is not None:
                accumulator = accumulator.split(hit)[0]
                process.terminate()  # Stop generating as soon as possible
                break

            # Stream text safely outside the trailing boundary window.
            if len(accumulator) > lookahead_size:
                yield accumulator[:-lookahead_size].strip()

        yield accumulator.strip()
    finally:
        # Runs on normal completion, on a stop marker, and when the consumer
        # closes the generator early: never leave the subprocess running or
        # unreaped.
        if process.poll() is None:
            process.terminate()
        if process.stdout is not None:
            # NOTE(review): closing our end of the pipe presumably also stops
            # any child spawned by run_inference.py on its next write --
            # confirm against that script.
            process.stdout.close()
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()
# ==============================================================================
# TECHNICAL REPORT MARKDOWN TEXT
# ==============================================================================
# Markdown shown in the "Technical Report" tab. Block elements are separated
# by blank lines on purpose: a "---" directly under a paragraph is parsed as a
# setext heading underline, and text directly under a "> " quote is absorbed
# into the quote.
TECHNICAL_REPORT_MD = """
## 📋 Technical Report: 1-Bit LLM Socratic Refinement Pipeline

**Architecture Core:** Ternary Quantized (1.58-bit) Matrix Processing

---

### 1. Executive Objective & Target Dataset

The goal of this initiative was to engineer a hyper-lightweight, lightning-fast edge computing application capable of engaging users in conversational Socratic exploration. Traditional full-precision models require significant memory overhead to hold nuanced philosophical frameworks. This project focused on building an ultra-compressed conversational experience capable of executing inside a constrained local CPU footprint (e.g., standard consumer laptops or free cloud application tiers).

* **Target Fine-Tuning Dataset:** `sanjaypantdsd/socratic-method-conversations`
* **Data Characteristics:** High-quality, clean input-to-output mappings that translate explicit factual questions or structural concepts directly into clusters of exactly three open-ended, deeply analytical questions.

---

### 2. Model Training Matrix & Evaluation Phase

Our initial strategy focused on fine-tuning custom models directly on our targeted Socratic dataset. The results exposed clear engineering trade-offs:

| Model Identifier | Architecture Configuration | Operational Performance | Qualitative Evaluation |
| :--- | :--- | :--- | :--- |
| **st192011/bitnet-socratic-1.58b** | Full-precision parameter adjustments tailored to target dataset. | **Excellent** | Produced highly coherent Socratic question arrays aligning perfectly with training structures. |
| **st192011/socratic-bitnet-2b** | Quantized Ternary Representation Variant of custom weights. | **Critically Poor** | Suffered extreme degradation. The model experienced severe structural collapse, outputting infinite semantic loops or unreadable token gibberish. |

#### Analysis of Quantization Collapse

The stark failure of `st192011/socratic-bitnet-2b` highlights a common hurdle in customized 1-bit AI development. When a model's weights are aggressively compressed down to simple ternary values (-1, 0, 1), the mathematical boundaries become extremely rigid. Standard quantization tools often distort the delicate behavioral traits introduced during fine-tuning.

---

### 3. Strategy Pivot: Pretrained Weights + Structural Prompt Anchoring

To avoid the quantization bugs of custom fine-tuned weights, we pivoted to a hybrid solution: **combining the official pretrained base weights from Microsoft with precision prompt engineering.**

We deployed `microsoft/bitnet-b1.58-2B-4T-gguf`. While this preserved its foundational knowledge base, it introduced a new challenge: **Base models do not natively know when to stop generating.**

#### The Stop-Token Anchor Hack

To enforce structure, we modified the System Prompt to force the model to declare its own stopping point:

> *"You are a Socratic assistant... Respond exclusively with 3 deep, reflective questions. Then generate %^%^%^"*

This instruction forces the text-prediction engine to anchor itself on a predictable phrase. While the model still experiences trailing hallucinations, it prints a recognizable marker *immediately after* providing the high-quality questions.

---

### 4. Production Pipeline Architecture

To deliver a flawless UX, we implemented a **Programmatic UX Stream Filter**:

* **The Lookahead Buffer Zone:** The streaming engine retains the trailing 45 characters inside a private memory array, evaluating it for known stop-sequences before releasing clean text to the UI.
* **Process Resource Reclamation:** The moment a marker is tripped, a background system command kills the active process (`process.terminate()`).
"""
# ==============================================================================
# GRADIO INTERFACE LAYOUT (TABBED WINDOWS)
# ==============================================================================
# Two-tab UI: an interactive sandbox wired to streaming_chat(), and a static
# page rendering TECHNICAL_REPORT_MD.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 High-Performance 1-Bit AI Sandbox")
    with gr.Tabs():
        # --- TAB 1: INTERACTIVE APP ---
        with gr.TabItem("Experimental Interface"):
            gr.Markdown("### Real-Time 1.58-bit Prompting Sandbox")
            gr.Markdown("Test the limits of Microsoft's BitNet GGUF kernel. Change the persona, modify the rules, and see how the Python cleaning function reacts in real-time.")

            # Top row: explanation on the left, editable system prompt on the right.
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 🛠️ The \"Stop Token\" Hack")
                    gr.Markdown(
                        "**Base models don't know how to stop talking!**\n\n"
                        "To prevent infinite loops, our system prompt instructs the model to literally type the words `%^%^%^` when it is finished. "
                        "Our Python backend uses a **Lookahead Buffer** to watch for those words. If it sees them, it instantly slices them out and kills the engine.\n\n"
                        "*🧪 Try deleting the words `'Then generate %^%^%^'` from the prompt below and see what happens!*"
                    )
                with gr.Column(scale=2):
                    system_prompt_input = gr.Textbox(
                        label="System Instruction (Editable)",
                        value=DEFAULT_SYSTEM_PROMPT,
                        lines=3
                    )

            gr.Markdown("---")

            # Bottom row: query input and button on the left, streamed output on the right.
            with gr.Row():
                with gr.Column(scale=4):
                    input_text = gr.Textbox(
                        label="User Query",
                        placeholder="e.g., What makes something responsibility?",
                        lines=2
                    )
                    submit_btn = gr.Button("Generate Response", variant="primary")
                with gr.Column(scale=5):
                    output_text = gr.Textbox(
                        label="Cleaned Real-Time Streaming Output",
                        lines=8,
                        interactive=False
                    )

            # Both the button and pressing Enter in the query box start a
            # stream; the system prompt is passed as the second input.
            submit_btn.click(fn=streaming_chat, inputs=[input_text, system_prompt_input], outputs=output_text)
            input_text.submit(fn=streaming_chat, inputs=[input_text, system_prompt_input], outputs=output_text)

        # --- TAB 2: TECHNICAL REPORT ---
        with gr.TabItem("Technical Report"):
            gr.Markdown(TECHNICAL_REPORT_MD)
# Start the web server only when executed as a script: listen on all
# interfaces, port 7860.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)