Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -9,22 +9,24 @@ if os.path.exists("/content/BitNet"):
|
|
| 9 |
# ==============================================================================
|
| 10 |
# CONSTANTS & CONFIGURATION
|
| 11 |
# ==============================================================================
|
| 12 |
-
|
|
|
|
|
|
|
| 13 |
"You are a Socratic assistant. Do not answer questions directly. "
|
| 14 |
"Instead, respond exclusively with 3 deep, reflective questions. "
|
| 15 |
"Then generate stop token"
|
| 16 |
)
|
| 17 |
-
MODEL_PATH = "models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf"
|
| 18 |
|
| 19 |
# ==============================================================================
|
| 20 |
# STREAMING ENGINE WITH LOOKAHEAD BUFFER
|
| 21 |
# ==============================================================================
|
| 22 |
-
def
|
| 23 |
if not user_query.strip():
|
| 24 |
yield "Please enter a valid question."
|
| 25 |
return
|
| 26 |
|
| 27 |
-
|
|
|
|
| 28 |
|
| 29 |
cmd = [
|
| 30 |
"python3", "run_inference.py",
|
|
@@ -32,13 +34,13 @@ def socratic_streaming_chat(user_query):
|
|
| 32 |
"-p", formatted_chat_prompt,
|
| 33 |
"-n", "120",
|
| 34 |
"-temp", "0.4",
|
| 35 |
-
"-t", "2"
|
| 36 |
]
|
| 37 |
|
| 38 |
process = subprocess.Popen(
|
| 39 |
cmd,
|
| 40 |
stdout=subprocess.PIPE,
|
| 41 |
-
stderr=subprocess.DEVNULL, #
|
| 42 |
text=True,
|
| 43 |
bufsize=1
|
| 44 |
)
|
|
@@ -47,6 +49,7 @@ def socratic_streaming_chat(user_query):
|
|
| 47 |
prompt_cleared = False
|
| 48 |
LOOKAHEAD_SIZE = 45
|
| 49 |
|
|
|
|
| 50 |
stop_markers = [
|
| 51 |
"Stop token", "stop token",
|
| 52 |
"Stop.", "stop.",
|
|
@@ -61,24 +64,24 @@ def socratic_streaming_chat(user_query):
|
|
| 61 |
|
| 62 |
accumulator += char
|
| 63 |
|
| 64 |
-
# ---
|
| 65 |
if not prompt_cleared:
|
| 66 |
if "Assistant:" in accumulator:
|
| 67 |
prompt_cleared = True
|
| 68 |
-
# Delete the prompt and keep only what comes after "Assistant:"
|
| 69 |
accumulator = accumulator.split("Assistant:")[-1].lstrip()
|
| 70 |
-
continue
|
| 71 |
|
| 72 |
-
# Scan for structural
|
| 73 |
stop_triggered = False
|
| 74 |
for marker in stop_markers:
|
| 75 |
if marker in accumulator:
|
|
|
|
| 76 |
accumulator = accumulator.split(marker)[0]
|
| 77 |
stop_triggered = True
|
| 78 |
break
|
| 79 |
|
| 80 |
if stop_triggered:
|
| 81 |
-
process.terminate()
|
| 82 |
break
|
| 83 |
|
| 84 |
# Stream text safely outside the trailing boundary window
|
|
@@ -92,7 +95,7 @@ def socratic_streaming_chat(user_query):
|
|
| 92 |
# TECHNICAL REPORT MARKDOWN TEXT
|
| 93 |
# ==============================================================================
|
| 94 |
TECHNICAL_REPORT_MD = """
|
| 95 |
-
## 📋
|
| 96 |
**Architecture Core:** Ternary Quantized (1.58-bit) Matrix Processing
|
| 97 |
|
| 98 |
---
|
|
@@ -106,7 +109,7 @@ The goal of this initiative was to engineer a hyper-lightweight, lightning-fast
|
|
| 106 |
---
|
| 107 |
|
| 108 |
### 2. Model Training Matrix & Evaluation Phase
|
| 109 |
-
Our initial strategy focused on fine-tuning custom models directly on our targeted Socratic dataset. The results exposed clear engineering trade-offs
|
| 110 |
|
| 111 |
| Model Identifier | Architecture Configuration | Operational Performance | Qualitative Evaluation |
|
| 112 |
| :--- | :--- | :--- | :--- |
|
|
@@ -114,51 +117,69 @@ Our initial strategy focused on fine-tuning custom models directly on our target
|
|
| 114 |
| **st192011/socratic-bitnet-2b** | Quantized Ternary Representation Variant of custom weights. | **Critically Poor** | Suffered extreme degradation. The model experienced severe structural collapse, outputting infinite semantic loops or unreadable token gibberish. |
|
| 115 |
|
| 116 |
#### Analysis of Quantization Collapse
|
| 117 |
-
The stark failure of `st192011/socratic-bitnet-2b` highlights a common hurdle in customized 1-bit AI development. When a model's weights are aggressively compressed down to simple ternary values (-1, 0, 1), the mathematical boundaries become extremely rigid. Standard quantization tools often distort the delicate behavioral traits introduced during fine-tuning
|
| 118 |
|
| 119 |
---
|
| 120 |
|
| 121 |
### 3. Strategy Pivot: Pretrained Weights + Structural Prompt Anchoring
|
| 122 |
-
To avoid the quantization bugs of custom fine-tuned weights, we pivoted to
|
| 123 |
|
| 124 |
-
We deployed
|
| 125 |
|
| 126 |
#### The Stop-Token Anchor Hack
|
| 127 |
-
To enforce structure
|
| 128 |
> *"You are a Socratic assistant... Respond exclusively with 3 deep, reflective questions. Then generate stop token"*
|
| 129 |
|
| 130 |
-
This instruction forces the
|
| 131 |
|
| 132 |
---
|
| 133 |
|
| 134 |
### 4. Production Pipeline Architecture
|
| 135 |
-
To deliver a flawless
|
| 136 |
|
| 137 |
-
* **The Lookahead Buffer Zone:** The streaming engine retains the trailing 45 characters
|
| 138 |
-
* **Process Resource Reclamation:** The moment a marker is tripped, a background system command kills the active process (`process.terminate()`).
|
| 139 |
-
* **Flawless Formatting Output:** The final user interface performs at near real-time speeds on commodity hardware, delivering clean, high-precision Socratic prompts with zero visual clutter.
|
| 140 |
"""
|
| 141 |
|
| 142 |
# ==============================================================================
|
| 143 |
# GRADIO INTERFACE LAYOUT (TABBED WINDOWS)
|
| 144 |
# ==============================================================================
|
| 145 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 146 |
-
gr.Markdown("# 🧠 High-Performance 1-Bit
|
| 147 |
|
| 148 |
with gr.Tabs():
|
| 149 |
# --- TAB 1: INTERACTIVE APP ---
|
| 150 |
-
with gr.TabItem("
|
| 151 |
-
gr.Markdown("### Real-Time
|
| 152 |
-
gr.Markdown("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
with gr.Row():
|
| 155 |
with gr.Column(scale=4):
|
| 156 |
input_text = gr.Textbox(
|
| 157 |
-
label="
|
| 158 |
-
placeholder="
|
| 159 |
lines=2
|
| 160 |
)
|
| 161 |
-
submit_btn = gr.Button("
|
| 162 |
with gr.Column(scale=5):
|
| 163 |
output_text = gr.Textbox(
|
| 164 |
label="Cleaned Real-Time Streaming Output",
|
|
@@ -166,8 +187,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 166 |
interactive=False
|
| 167 |
)
|
| 168 |
|
| 169 |
-
|
| 170 |
-
|
|
|
|
| 171 |
|
| 172 |
# --- TAB 2: TECHNICAL REPORT ---
|
| 173 |
with gr.TabItem("Technical Report"):
|
|
|
|
| 9 |
# ==============================================================================
|
| 10 |
# CONSTANTS & CONFIGURATION
|
| 11 |
# ==============================================================================
|
| 12 |
+
MODEL_PATH = "models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf"
|
| 13 |
+
|
| 14 |
+
DEFAULT_SYSTEM_PROMPT = (
|
| 15 |
"You are a Socratic assistant. Do not answer questions directly. "
|
| 16 |
"Instead, respond exclusively with 3 deep, reflective questions. "
|
| 17 |
"Then generate stop token"
|
| 18 |
)
|
|
|
|
| 19 |
|
| 20 |
# ==============================================================================
|
| 21 |
# STREAMING ENGINE WITH LOOKAHEAD BUFFER
|
| 22 |
# ==============================================================================
|
| 23 |
+
def streaming_chat(user_query, system_prompt):
|
| 24 |
if not user_query.strip():
|
| 25 |
yield "Please enter a valid question."
|
| 26 |
return
|
| 27 |
|
| 28 |
+
# Dynamically inject the user's custom system instruction
|
| 29 |
+
formatted_chat_prompt = f"System: {system_prompt}\nUser: {user_query}\nAssistant:"
|
| 30 |
|
| 31 |
cmd = [
|
| 32 |
"python3", "run_inference.py",
|
|
|
|
| 34 |
"-p", formatted_chat_prompt,
|
| 35 |
"-n", "120",
|
| 36 |
"-temp", "0.4",
|
| 37 |
+
"-t", "2" # Optimized for Hugging Face free-tier dual-core CPUs
|
| 38 |
]
|
| 39 |
|
| 40 |
process = subprocess.Popen(
|
| 41 |
cmd,
|
| 42 |
stdout=subprocess.PIPE,
|
| 43 |
+
stderr=subprocess.DEVNULL, # Hide system logs
|
| 44 |
text=True,
|
| 45 |
bufsize=1
|
| 46 |
)
|
|
|
|
| 49 |
prompt_cleared = False
|
| 50 |
LOOKAHEAD_SIZE = 45
|
| 51 |
|
| 52 |
+
# These are the markers our Python function uses to slice the text
|
| 53 |
stop_markers = [
|
| 54 |
"Stop token", "stop token",
|
| 55 |
"Stop.", "stop.",
|
|
|
|
| 64 |
|
| 65 |
accumulator += char
|
| 66 |
|
| 67 |
+
# --- SWALLOW THE ECHOED PROMPT ---
|
| 68 |
if not prompt_cleared:
|
| 69 |
if "Assistant:" in accumulator:
|
| 70 |
prompt_cleared = True
|
|
|
|
| 71 |
accumulator = accumulator.split("Assistant:")[-1].lstrip()
|
| 72 |
+
continue
|
| 73 |
|
| 74 |
+
# --- THE CLEANING FUNCTION: Scan for structural boundaries ---
|
| 75 |
stop_triggered = False
|
| 76 |
for marker in stop_markers:
|
| 77 |
if marker in accumulator:
|
| 78 |
+
# The moment a marker is found, slice the text and trigger the kill switch
|
| 79 |
accumulator = accumulator.split(marker)[0]
|
| 80 |
stop_triggered = True
|
| 81 |
break
|
| 82 |
|
| 83 |
if stop_triggered:
|
| 84 |
+
process.terminate() # Stop the engine (on POSIX this sends SIGTERM, a termination request rather than a forced kill)
|
| 85 |
break
|
| 86 |
|
| 87 |
# Stream text safely outside the trailing boundary window
|
|
|
|
| 95 |
# TECHNICAL REPORT MARKDOWN TEXT
|
| 96 |
# ==============================================================================
|
| 97 |
TECHNICAL_REPORT_MD = """
|
| 98 |
+
## 📋 Technical Report: 1-Bit LLM Socratic Refinement Pipeline
|
| 99 |
**Architecture Core:** Ternary Quantized (1.58-bit) Matrix Processing
|
| 100 |
|
| 101 |
---
|
|
|
|
| 109 |
---
|
| 110 |
|
| 111 |
### 2. Model Training Matrix & Evaluation Phase
|
| 112 |
+
Our initial strategy focused on fine-tuning custom models directly on our targeted Socratic dataset. The results exposed clear engineering trade-offs:
|
| 113 |
|
| 114 |
| Model Identifier | Architecture Configuration | Operational Performance | Qualitative Evaluation |
|
| 115 |
| :--- | :--- | :--- | :--- |
|
|
|
|
| 117 |
| **st192011/socratic-bitnet-2b** | Quantized Ternary Representation Variant of custom weights. | **Critically Poor** | Suffered extreme degradation. The model experienced severe structural collapse, outputting infinite semantic loops or unreadable token gibberish. |
|
| 118 |
|
| 119 |
#### Analysis of Quantization Collapse
|
| 120 |
+
The stark failure of `st192011/socratic-bitnet-2b` highlights a common hurdle in customized 1-bit AI development. When a model's weights are aggressively compressed down to simple ternary values (-1, 0, 1), the mathematical boundaries become extremely rigid. Standard quantization tools often distort the delicate behavioral traits introduced during fine-tuning.
|
| 121 |
|
| 122 |
---
|
| 123 |
|
| 124 |
### 3. Strategy Pivot: Pretrained Weights + Structural Prompt Anchoring
|
| 125 |
+
To avoid the quantization bugs of custom fine-tuned weights, we pivoted to a hybrid solution: **combining the official pretrained base weights from Microsoft with precision prompt engineering.**
|
| 126 |
|
| 127 |
+
We deployed `microsoft/bitnet-b1.58-2B-4T-gguf`. While this preserved its foundational knowledge base, it introduced a new challenge: **Base models do not natively know when to stop generating.**
|
| 128 |
|
| 129 |
#### The Stop-Token Anchor Hack
|
| 130 |
+
To enforce structure, we modified the System Prompt to force the model to declare its own stopping point:
|
| 131 |
> *"You are a Socratic assistant... Respond exclusively with 3 deep, reflective questions. Then generate stop token"*
|
| 132 |
|
| 133 |
+
This instruction forces the text-prediction engine to anchor itself on a predictable phrase. While the model still experiences trailing hallucinations, it prints a recognizable marker *immediately after* providing the high-quality questions.
|
| 134 |
|
| 135 |
---
|
| 136 |
|
| 137 |
### 4. Production Pipeline Architecture
|
| 138 |
+
To deliver a flawless UX, we implemented a **Programmatic UX Stream Filter**:
|
| 139 |
|
| 140 |
+
* **The Lookahead Buffer Zone:** The streaming engine holds back the trailing 45 characters in an in-memory string buffer, checking them for known stop-sequences before releasing clean text to the UI.
|
| 141 |
+
* **Process Resource Reclamation:** The moment a marker is tripped, the backend stops the inference subprocess by calling `process.terminate()` on it, so generation does not continue past the stop marker.
|
|
|
|
| 142 |
"""
|
| 143 |
|
| 144 |
# ==============================================================================
|
| 145 |
# GRADIO INTERFACE LAYOUT (TABBED WINDOWS)
|
| 146 |
# ==============================================================================
|
| 147 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 148 |
+
gr.Markdown("# 🧠 High-Performance 1-Bit AI Sandbox")
|
| 149 |
|
| 150 |
with gr.Tabs():
|
| 151 |
# --- TAB 1: INTERACTIVE APP ---
|
| 152 |
+
with gr.TabItem("Experimental Interface"):
|
| 153 |
+
gr.Markdown("### Real-Time 1.58-bit Prompting Sandbox")
|
| 154 |
+
gr.Markdown("Test the limits of Microsoft's BitNet GGUF kernel. Change the persona, modify the rules, and see how the Python cleaning function reacts in real-time.")
|
| 155 |
+
|
| 156 |
+
with gr.Row():
|
| 157 |
+
with gr.Column(scale=1):
|
| 158 |
+
gr.Markdown("### 🛠️ The \"Stop Token\" Hack")
|
| 159 |
+
gr.Markdown(
|
| 160 |
+
"**Base models don't know how to stop talking!**\n\n"
|
| 161 |
+
"To prevent infinite loops, our system prompt instructs the model to literally type the words `Stop token` when it is finished. "
|
| 162 |
+
"Our Python backend uses a **Lookahead Buffer** to watch for those words. If it sees them, it instantly slices them out and kills the engine.\n\n"
|
| 163 |
+
"*🧪 Try deleting the words `'Then generate stop token'` from the prompt below and see what happens!*"
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
with gr.Column(scale=2):
|
| 167 |
+
system_prompt_input = gr.Textbox(
|
| 168 |
+
label="System Instruction (Editable)",
|
| 169 |
+
value=DEFAULT_SYSTEM_PROMPT,
|
| 170 |
+
lines=3
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
gr.Markdown("---")
|
| 174 |
|
| 175 |
with gr.Row():
|
| 176 |
with gr.Column(scale=4):
|
| 177 |
input_text = gr.Textbox(
|
| 178 |
+
label="User Query",
|
| 179 |
+
placeholder="e.g., What makes something responsibility?",
|
| 180 |
lines=2
|
| 181 |
)
|
| 182 |
+
submit_btn = gr.Button("Generate Response", variant="primary")
|
| 183 |
with gr.Column(scale=5):
|
| 184 |
output_text = gr.Textbox(
|
| 185 |
label="Cleaned Real-Time Streaming Output",
|
|
|
|
| 187 |
interactive=False
|
| 188 |
)
|
| 189 |
|
| 190 |
+
# Wire up the inputs to include the system prompt
|
| 191 |
+
submit_btn.click(fn=streaming_chat, inputs=[input_text, system_prompt_input], outputs=output_text)
|
| 192 |
+
input_text.submit(fn=streaming_chat, inputs=[input_text, system_prompt_input], outputs=output_text)
|
| 193 |
|
| 194 |
# --- TAB 2: TECHNICAL REPORT ---
|
| 195 |
with gr.TabItem("Technical Report"):
|