Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,6 +2,13 @@ import gradio as gr
|
|
| 2 |
import subprocess
|
| 3 |
import os
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
SYSTEM_INSTRUCTION = (
|
| 6 |
"You are a Socratic assistant. Do not answer questions directly. "
|
| 7 |
"Instead, respond exclusively with 3 deep, reflective questions. "
|
|
@@ -9,60 +16,161 @@ SYSTEM_INSTRUCTION = (
|
|
| 9 |
)
|
| 10 |
MODEL_PATH = "models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf"
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
lowest_index = len(generated_text)
|
| 17 |
-
marker_found = False
|
| 18 |
-
for marker in stop_markers:
|
| 19 |
-
idx = generated_text.find(marker)
|
| 20 |
-
if idx != -1 and idx < lowest_index:
|
| 21 |
-
lowest_index = idx
|
| 22 |
-
marker_found = True
|
| 23 |
-
|
| 24 |
-
if marker_found:
|
| 25 |
-
generated_text = generated_text[:lowest_index].strip()
|
| 26 |
-
if generated_text.endswith(("\n4.", "\n4", "4.", "4")):
|
| 27 |
-
generated_text = generated_text.rsplit("\n4", 1)[0].strip()
|
| 28 |
-
return generated_text
|
| 29 |
-
|
| 30 |
-
def socratic_chat(user_query):
|
| 31 |
if not user_query.strip():
|
| 32 |
-
|
| 33 |
-
|
|
|
|
| 34 |
formatted_chat_prompt = f"System: {SYSTEM_INSTRUCTION}\nUser: {user_query}\nAssistant:"
|
| 35 |
|
| 36 |
cmd = [
|
| 37 |
"python3", "run_inference.py",
|
| 38 |
"-m", MODEL_PATH,
|
| 39 |
"-p", formatted_chat_prompt,
|
| 40 |
-
"-n", "
|
| 41 |
"-temp", "0.4",
|
| 42 |
-
"-t", "2" #
|
| 43 |
]
|
| 44 |
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 54 |
-
gr.Markdown("# 🧠 1-Bit
|
| 55 |
-
gr.Markdown("This interface is running an official Microsoft BitNet 1.58-bit model on an optimized CPU backend.")
|
| 56 |
|
| 57 |
-
with gr.
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
output_text = gr.Textbox(label="Socratic Feedback", lines=6)
|
| 63 |
|
| 64 |
-
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
if __name__ == "__main__":
|
| 68 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|
| 2 |
import subprocess
|
| 3 |
import os
|
| 4 |
|
| 5 |
+
# Run from the BitNet checkout when it is present, so that the relative paths
# used further down ("run_inference.py" and MODEL_PATH) resolve against it.
# isdir() rather than exists(): a regular file at this path would pass an
# exists() check and then make os.chdir() raise at import time.
BITNET_DIR = "/content/BitNet"
if os.path.isdir(BITNET_DIR):
    os.chdir(BITNET_DIR)
|
| 8 |
+
|
| 9 |
+
# ==============================================================================
|
| 10 |
+
# CONSTANTS & CONFIGURATION
|
| 11 |
+
# ==============================================================================
|
| 12 |
SYSTEM_INSTRUCTION = (
|
| 13 |
"You are a Socratic assistant. Do not answer questions directly. "
|
| 14 |
"Instead, respond exclusively with 3 deep, reflective questions. "
|
|
|
|
| 16 |
)
|
| 17 |
# Model file handed to run_inference.py through its "-m" flag. The path is
# relative, so it is resolved against the process's current working directory.
MODEL_PATH = "models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf"
|
| 18 |
|
| 19 |
+
# ==============================================================================
|
| 20 |
+
# STREAMING ENGINE WITH LOOKAHEAD BUFFER
|
| 21 |
+
# ==============================================================================
|
| 22 |
+
def socratic_streaming_chat(user_query):
    """Stream a filtered Socratic reply for ``user_query``.

    Launches ``run_inference.py`` as a child process, reads its combined
    stdout/stderr one character at a time and yields cumulative snapshots of
    the text received so far. The newest ``LOOKAHEAD_SIZE`` characters are
    held back from every intermediate snapshot so that a stop marker can be
    detected and removed before any part of it has been displayed. Reading
    ends at the first stop marker or at end of stream; the complete cleaned
    text is then yielded once more, without the hold-back.

    The child process is always terminated and reaped, including when the
    consumer abandons the generator before it is exhausted.

    Args:
        user_query: Raw text entered by the user.

    Yields:
        str: The text to show so far (a full snapshot, not a delta). For a
        blank query a single validation message is yielded instead.
    """
    if not user_query.strip():
        yield "Please enter a valid question."
        return

    formatted_chat_prompt = f"System: {SYSTEM_INSTRUCTION}\nUser: {user_query}\nAssistant:"

    cmd = [
        "python3", "run_inference.py",
        "-m", MODEL_PATH,
        "-p", formatted_chat_prompt,
        "-n", "120",
        "-temp", "0.4",
        "-t", "2",  # Optimized for Hugging Face free-tier dual-core CPUs
    ]

    # Number of trailing characters withheld from intermediate snapshots; it
    # must be at least as long as the longest stop marker.
    LOOKAHEAD_SIZE = 45

    stop_markers = (
        "Stop token", "stop token",
        "Stop.", "stop.",
        "Response:", "Response",
        "Assistant:",
    )

    # NOTE: stderr is merged into stdout, so anything the child writes to
    # stderr passes through the same filter and is displayed like model text.
    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
    )

    accumulator = ""
    last_shown = None

    try:
        while True:
            char = process.stdout.read(1)
            if not char:
                break  # End of stream reached

            accumulator += char

            # Prompt echo handling. Once the whole prompt has been echoed it
            # is discarded; while the stream is still a strict prefix of the
            # prompt nothing is scanned or displayed, because the prompt text
            # itself contains stop markers (it ends in "Assistant:") and must
            # not be shown to the user.
            if accumulator == formatted_chat_prompt:
                accumulator = ""
                continue
            if formatted_chat_prompt.startswith(accumulator):
                continue

            # Cut at the earliest marker position in the text, independent of
            # the order in which the markers are listed.
            hits = [accumulator.find(marker) for marker in stop_markers]
            cut_at = min((pos for pos in hits if pos != -1), default=-1)
            if cut_at != -1:
                accumulator = accumulator[:cut_at]
                # Stop the engine right away instead of letting it keep
                # generating text that will be thrown away.
                process.terminate()
                break

            # Only release text that lies outside the trailing hold-back
            # window, and skip snapshots identical to the previous one.
            if len(accumulator) > LOOKAHEAD_SIZE:
                visible = accumulator[:-LOOKAHEAD_SIZE].strip()
                if visible != last_shown:
                    last_shown = visible
                    yield visible
    finally:
        # Runs on normal completion, on a stop marker, on errors, and when the
        # generator is closed early by the consumer.
        _stop_process(process)

    # Final, un-buffered snapshot of the cleaned text.
    yield accumulator.strip()


def _stop_process(process):
    """Terminate ``process`` if still running, close its pipe and reap it."""
    # NOTE(review): terminate() only signals the direct child; if
    # run_inference.py starts a process of its own, confirm that it exits too.
    if process.poll() is None:
        process.terminate()
    if process.stdout is not None:
        process.stdout.close()
    try:
        process.wait(timeout=5)
    except subprocess.TimeoutExpired:
        # terminate() was not honoured in time; force the process down so no
        # inference run is left behind.
        process.kill()
        process.wait()
|
| 89 |
+
|
| 90 |
+
# ==============================================================================
|
| 91 |
+
# TECHNICAL REPORT MARKDOWN TEXT
|
| 92 |
+
# ==============================================================================
|
| 93 |
+
TECHNICAL_REPORT_MD = """
|
| 94 |
+
## 📋 Project Technical Report: 1-Bit LLM Socratic Refinement Pipeline
|
| 95 |
+
**Architecture Core:** Ternary Quantized (1.58-bit) Matrix Processing
|
| 96 |
+
|
| 97 |
+
---
|
| 98 |
|
| 99 |
+
### 1. Executive Objective & Target Dataset
|
| 100 |
+
The goal of this initiative was to engineer a hyper-lightweight, lightning-fast edge computing application capable of engaging users in conversational Socratic exploration. Traditional full-precision models require significant memory overhead to hold nuanced philosophical frameworks. This project focused on building an ultra-compressed conversational experience capable of executing inside a constrained local CPU footprint (e.g., standard consumer laptops or free cloud application tiers).
|
| 101 |
+
|
| 102 |
+
* **Target Fine-Tuning Dataset:** `sanjaypantdsd/socratic-method-conversations`
|
| 103 |
+
* **Data Characteristics:** High-quality, clean input-to-output mappings that translate explicit factual questions or structural concepts directly into clusters of exactly three open-ended, deeply analytical questions.
|
| 104 |
+
|
| 105 |
+
---
|
| 106 |
+
|
| 107 |
+
### 2. Model Training Matrix & Evaluation Phase
|
| 108 |
+
Our initial strategy focused on fine-tuning custom models directly on our targeted Socratic dataset. The results exposed clear engineering trade-offs between unquantized fine-tuning weight adjustments and customized binary compilation layers:
|
| 109 |
+
|
| 110 |
+
| Model Identifier | Architecture Configuration | Operational Performance | Qualitative Evaluation |
|
| 111 |
+
| :--- | :--- | :--- | :--- |
|
| 112 |
+
| **st192011/bitnet-socratic-1.58b** | Full-precision parameter adjustments tailored to target dataset. | **Excellent** | Produced highly coherent Socratic question arrays aligning perfectly with training structures. |
|
| 113 |
+
| **st192011/socratic-bitnet-2b** | Quantized Ternary Representation Variant of custom weights. | **Critically Poor** | Suffered extreme degradation. The model experienced severe structural collapse, outputting infinite semantic loops or unreadable token gibberish. |
|
| 114 |
+
|
| 115 |
+
#### Analysis of Quantization Collapse
|
| 116 |
+
The stark failure of `st192011/socratic-bitnet-2b` highlights a common hurdle in customized 1-bit AI development. When a model's weights are aggressively compressed down to simple ternary values (-1, 0, 1), the mathematical boundaries become extremely rigid. Standard quantization tools often distort the delicate behavioral traits introduced during fine-tuning, leading to a complete breakdown of language modeling capabilities.
|
| 117 |
+
|
| 118 |
+
---
|
| 119 |
+
|
| 120 |
+
### 3. Strategy Pivot: Pretrained Weights + Structural Prompt Anchoring
|
| 121 |
+
To avoid the quantization bugs of custom fine-tuned weights, we pivoted to an elegant hybrid solution: **combining the official, verified pretrained base weights from Microsoft with precision prompt engineering.**
|
| 122 |
+
|
| 123 |
+
We deployed the official `microsoft/bitnet-b1.58-2B-4T-gguf` base model. While this preserved its deep, foundational knowledge base, it introduced a new challenge: **Base models do not natively know when to stop generating.**
|
| 124 |
+
|
| 125 |
+
#### The Stop-Token Anchor Hack
|
| 126 |
+
To enforce structure without re-training the model, we modified the `SYSTEM_INSTRUCTION` block to force the model to declare its own stopping point:
|
| 127 |
+
> *"You are a Socratic assistant... Respond exclusively with 3 deep, reflective questions. Then generate stop token"*
|
| 128 |
+
|
| 129 |
+
This instruction forces the model's text-prediction engine to anchor itself on a predictable phrase as soon as its linguistic objective is met. Our test iterations confirmed that while the model still experiences trailing token hallucinations (e.g., repeating `Stop. Stop. Stop. Response: 1.`), it prints a recognizable marker *immediately after* providing the high-quality questions.
|
| 130 |
+
|
| 131 |
+
---
|
| 132 |
+
|
| 133 |
+
### 4. Production Pipeline Architecture
|
| 134 |
+
To deliver a flawless user experience, we implemented a **Programmatic UX Stream Filter** running on the host system. This layer completely isolates the user from any underlying engine instability:
|
| 135 |
+
|
| 136 |
+
* **The Lookahead Buffer Zone:** The streaming engine retains the trailing 45 characters of generation inside a private memory array, evaluating it for known stop-sequences before releasing clean text to the UI.
|
| 137 |
+
* **Process Resource Reclamation:** The moment a marker is tripped, a background system command kills the active process (`process.terminate()`). This prevents the model from wasting CPU cycles on hallucinated loops, maximizing host performance.
|
| 138 |
+
* **Flawless Formatting Output:** The final user interface performs at near real-time speeds on commodity hardware, delivering clean, high-precision Socratic prompts with zero visual clutter.
|
| 139 |
+
"""
|
| 140 |
+
|
| 141 |
+
# ==============================================================================
|
| 142 |
+
# GRADIO INTERFACE LAYOUT (TABBED WINDOWS)
|
| 143 |
+
# ==============================================================================
|
| 144 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 High-Performance 1-Bit Socratic Workspace")

    with gr.Tabs():
        # First tab: the interactive assistant.
        with gr.TabItem("Socratic Assistant"):
            gr.Markdown("### Real-Time Socratic Exploration")
            gr.Markdown("Real-time token streaming powered by Microsoft's 1.58-bit BitNet GGUF kernel alongside dynamic programmatic output filtering.")

            with gr.Row():
                # Input column: the question box and its trigger button.
                with gr.Column(scale=4):
                    input_text = gr.Textbox(
                        lines=2,
                        label="Concept Prompt",
                        placeholder="What concept do you wish to dissect? (e.g., What makes something responsibility?)",
                    )
                    submit_btn = gr.Button("Dissect Concept via Socratic Dialogue", variant="primary")
                # Output column: read-only box that receives the streamed text.
                with gr.Column(scale=5):
                    output_text = gr.Textbox(
                        interactive=False,
                        lines=8,
                        label="Cleaned Real-Time Streaming Output",
                    )

            # The button's click event and the textbox's submit event are wired
            # to the same streaming generator with the same input and output.
            stream_binding = dict(
                fn=socratic_streaming_chat,
                inputs=input_text,
                outputs=output_text,
            )
            submit_btn.click(**stream_binding)
            input_text.submit(**stream_binding)

        # Second tab: the static technical report.
        with gr.TabItem("Technical Report"):
            gr.Markdown(TECHNICAL_REPORT_MD)

if __name__ == "__main__":
    # Serve on 0.0.0.0:7860 when the file is executed as a script.
    demo.launch(server_port=7860, server_name="0.0.0.0")
|