Spaces:

st192011
/

Bitnet-Socratic-1-Bit

Running

App Files Files Community

st192011 committed on 4 days ago

Commit

1fc5cde

verified ·

1 Parent(s): 6f7a25d

Update app.py

Browse files

Files changed (1) hide show

app.py +147 -39

app.py CHANGED Viewed

@@ -2,6 +2,13 @@ import gradio as gr
 import subprocess
 import os
 SYSTEM_INSTRUCTION = (
     "You are a Socratic assistant. Do not answer questions directly. "
     "Instead, respond exclusively with 3 deep, reflective questions. "
@@ -9,60 +16,161 @@ SYSTEM_INSTRUCTION = (
 )
 MODEL_PATH = "models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf"
def clean_socratic_output(raw_stdout, full_prompt):
    """Extract the generated questions from raw inference output.

    Every occurrence of the prompt is removed, the text is cut at the earliest
    stop marker, and a dangling fourth list opener ("4" or "4." alone on the
    final line) is dropped.

    Args:
        raw_stdout: Full text captured from the inference process.
        full_prompt: The prompt that was sent to the model.

    Returns:
        The cleaned text with surrounding whitespace stripped.
    """
    generated_text = raw_stdout.replace(full_prompt, "").strip()

    stop_markers = ("Stop token", "stop token", "Stop.", "stop.", "Response:", "Response")
    marker_positions = [
        position
        for position in (generated_text.find(marker) for marker in stop_markers)
        if position != -1
    ]
    if marker_positions:
        # Cut at whichever marker appears first in the text.
        generated_text = generated_text[:min(marker_positions)].strip()

    # Only a "4"/"4." that sits alone on the last line is a dangling list
    # opener. Text that merely ends in the digit 4 (e.g. "... in 1984") must
    # be left intact, so match the newline-anchored suffix exactly instead of
    # splitting on an arbitrary earlier "\n4".
    for dangling in ("\n4.", "\n4"):
        if generated_text.endswith(dangling):
            generated_text = generated_text[: -len(dangling)].strip()
            break
    return generated_text
def socratic_chat(user_query):
    """Run one blocking inference pass and return the cleaned model output.

    Args:
        user_query: Text entered by the user. ``None``, empty, or
            whitespace-only input is rejected without starting the model.

    Returns:
        The cleaned model output; a validation message for blank input; or an
        "Error running model: ..." string carrying stderr when the process
        wrote nothing to stdout.
    """
    # Check for None first so that .strip() cannot raise AttributeError.
    if not user_query or not user_query.strip():
        return "Please enter a valid question."

    formatted_chat_prompt = f"System: {SYSTEM_INSTRUCTION}\nUser: {user_query}\nAssistant:"
    cmd = [
        "python3", "run_inference.py",
        "-m", MODEL_PATH,
        "-p", formatted_chat_prompt,
        "-n", "100",
        "-temp", "0.4",
        "-t", "2",  # Matches HF Space 2-vCPU hardware allocation
    ]
    # argv is passed as a list (no shell), so the user text stays one argument.
    execution_result = subprocess.run(cmd, capture_output=True, text=True)

    if not execution_result.stdout:
        return f"Error running model: {execution_result.stderr}"
    return clean_socratic_output(execution_result.stdout, formatted_chat_prompt)
# Gradio layout: one row with an input column (textbox + button) followed by
# an output column (result textbox).
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 1-Bit Local Socratic Assistant")
    gr.Markdown("This interface is running an official Microsoft BitNet 1.58-bit model on an optimized CPU backend.")

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="What concept are you exploring?",
                placeholder="e.g., What is justice?",
            )
            submit_btn = gr.Button("Generate Socratic Reflection", variant="primary")
        with gr.Column():
            output_text = gr.Textbox(label="Socratic Feedback", lines=6)

    # The button's click event and the textbox's submit event share one handler.
    chat_binding = {"fn": socratic_chat, "inputs": input_text, "outputs": output_text}
    submit_btn.click(**chat_binding)
    input_text.submit(**chat_binding)

if __name__ == "__main__":
    # Serve on all interfaces, port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)

 import subprocess
 import os
# Switch into the BitNet checkout when it is present so that the relative
# paths used later in this file ("run_inference.py", "models/...") resolve.
# isdir() is used rather than exists(): a regular file at this path would
# make os.chdir() raise at import time.
# NOTE(review): "/content/BitNet" looks like a notebook-style location —
# confirm it matches the deployment image.
_BITNET_DIR = "/content/BitNet"
if os.path.isdir(_BITNET_DIR):
    os.chdir(_BITNET_DIR)
+# ==============================================================================
+# CONSTANTS & CONFIGURATION
+# ==============================================================================
 SYSTEM_INSTRUCTION = (
     "You are a Socratic assistant. Do not answer questions directly. "
     "Instead, respond exclusively with 3 deep, reflective questions. "
 )
 MODEL_PATH = "models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf"
+# ==============================================================================
+# STREAMING ENGINE WITH LOOKAHEAD BUFFER
+# ==============================================================================
def _find_stop_marker(text, markers):
    """Return the index of the earliest stop marker in *text*, or -1 if none occurs."""
    positions = [pos for pos in (text.find(marker) for marker in markers) if pos != -1]
    return min(positions, default=-1)


def socratic_streaming_chat(user_query):
    """Stream a filtered model response for *user_query*.

    The inference script is started as a child process and its output is read
    one character at a time. Text is released to the caller only once it is
    more than ``lookahead_size`` characters behind the newest character, so a
    stop marker can be detected and removed before it is ever displayed.

    Args:
        user_query: Text entered by the user. ``None``, empty, or
            whitespace-only input is rejected without starting the model.

    Yields:
        Progressively longer, whitespace-stripped snapshots of the response.
        The last value yielded is the complete text, cut at the first stop
        marker if one was seen.
    """
    # Check for None first so that .strip() cannot raise AttributeError.
    if not user_query or not user_query.strip():
        yield "Please enter a valid question."
        return

    formatted_chat_prompt = f"System: {SYSTEM_INSTRUCTION}\nUser: {user_query}\nAssistant:"
    cmd = [
        "python3", "run_inference.py",
        "-m", MODEL_PATH,
        "-p", formatted_chat_prompt,
        "-n", "120",
        "-temp", "0.4",
        "-t", "2",  # Optimized for Hugging Face free-tier dual-core CPUs
    ]

    # Number of trailing characters withheld from the UI while they are still
    # being checked for stop markers.
    lookahead_size = 45
    stop_markers = (
        "Stop token", "stop token",
        "Stop.", "stop.",
        "Response:", "Response",
        "Assistant:",
    )
    accumulator = ""

    # stderr is merged into stdout, so anything the child writes to stderr
    # passes through the same filter as the generated text.
    # The context manager closes the pipe and reaps the child on every exit
    # path, including the consumer abandoning this generator early.
    with subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
    ) as process:
        try:
            while True:
                char = process.stdout.read(1)
                if not char:
                    break  # End of stream reached
                accumulator += char

                # While the stream is still an echo of the prompt, neither
                # scan nor display it: the prompt itself ends with
                # "Assistant:" (a stop marker) and the user's text may contain
                # markers too. Once the echo is complete, discard it.
                if formatted_chat_prompt.startswith(accumulator):
                    if accumulator == formatted_chat_prompt:
                        accumulator = ""
                    continue

                cut = _find_stop_marker(accumulator, stop_markers)
                if cut != -1:
                    # Keep only the text before the marker and stop reading;
                    # the child is terminated in the finally clause below.
                    accumulator = accumulator[:cut]
                    break

                # Release only the text that sits outside the lookahead window.
                if len(accumulator) > lookahead_size:
                    yield accumulator[:-lookahead_size].strip()
        finally:
            # Stop a still-running child so it does not keep generating, and
            # escalate to kill() if it ignores the terminate request.
            if process.poll() is None:
                process.terminate()
                try:
                    process.wait(timeout=5)
                except subprocess.TimeoutExpired:
                    process.kill()

    # Flush everything, including the previously withheld lookahead window.
    yield accumulator.strip()
+# ==============================================================================
+# TECHNICAL REPORT MARKDOWN TEXT
+# ==============================================================================
+TECHNICAL_REPORT_MD = """
+## 📋 Project Technical Report: 1-Bit LLM Socratic Refinement Pipeline
+**Architecture Core:** Ternary Quantized (1.58-bit) Matrix Processing
+---
+### 1. Executive Objective & Target Dataset
+The goal of this initiative was to engineer a hyper-lightweight, lightning-fast edge computing application capable of engaging users in conversational Socratic exploration. Traditional full-precision models require significant memory overhead to hold nuanced philosophical frameworks. This project focused on building an ultra-compressed conversational experience capable of executing inside a constrained local CPU footprint (e.g., standard consumer laptops or free cloud application tiers).
+* **Target Fine-Tuning Dataset:** `sanjaypantdsd/socratic-method-conversations`
+* **Data Characteristics:** High-quality, clean input-to-output mappings that translate explicit factual questions or structural concepts directly into clusters of exactly three open-ended, deeply analytical questions.
+---
+### 2. Model Training Matrix & Evaluation Phase
+Our initial strategy focused on fine-tuning custom models directly on our targeted Socratic dataset. The results exposed clear engineering trade-offs between unquantized fine-tuning weight adjustments and customized binary compilation layers:
+| Model Identifier | Architecture Configuration | Operational Performance | Qualitative Evaluation |
+| :--- | :--- | :--- | :--- |
+| **st192011/bitnet-socratic-1.58b** | Full-precision parameter adjustments tailored to target dataset. | **Excellent** | Produced highly coherent Socratic question arrays aligning perfectly with training structures. |
+| **st192011/socratic-bitnet-2b** | Quantized Ternary Representation Variant of custom weights. | **Critically Poor** | Suffered extreme degradation. The model experienced severe structural collapse, outputting infinite semantic loops or unreadable token gibberish. |
+#### Analysis of Quantization Collapse
+The stark failure of `st192011/socratic-bitnet-2b` highlights a common hurdle in customized 1-bit AI development. When a model's weights are aggressively compressed down to simple ternary values (-1, 0, 1), the mathematical boundaries become extremely rigid. Standard quantization tools often distort the delicate behavioral traits introduced during fine-tuning, leading to a complete breakdown of language modeling capabilities.
+---
+### 3. Strategy Pivot: Pretrained Weights + Structural Prompt Anchoring
+To avoid the quantization bugs of custom fine-tuned weights, we pivoted to an elegant hybrid solution: **combining the official, verified pretrained base weights from Microsoft with precision prompt engineering.**
+We deployed the official `microsoft/bitnet-b1.58-2B-4T-gguf` base model. While this preserved its deep, foundational knowledge base, it introduced a new challenge: **Base models do not natively know when to stop generating.**
+#### The Stop-Token Anchor Hack
+To enforce structure without re-training the model, we modified the `SYSTEM_INSTRUCTION` block to force the model to declare its own stopping point:
+> *"You are a Socratic assistant... Respond exclusively with 3 deep, reflective questions. Then generate stop token"*
+This instruction forces the model's text-prediction engine to anchor itself on a predictable phrase as soon as its linguistic objective is met. Our test iterations confirmed that while the model still experiences trailing token hallucinations (e.g., repeating `Stop. Stop. Stop. Response: 1.`), it prints a recognizable marker *immediately after* providing the high-quality questions.
+---
+### 4. Production Pipeline Architecture
+To deliver a flawless user experience, we implemented a **Programmatic UX Stream Filter** running on the host system. This layer completely isolates the user from any underlying engine instability:
+* **The Lookahead Buffer Zone:** The streaming engine retains the trailing 45 characters of generation inside a private memory array, evaluating it for known stop-sequences before releasing clean text to the UI.
+* **Process Resource Reclamation:** The moment a marker is tripped, a background system command kills the active process (`process.terminate()`). This prevents the model from wasting CPU cycles on hallucinated loops, maximizing host performance.
+* **Flawless Formatting Output:** The final user interface performs at near real-time speeds on commodity hardware, delivering clean, high-precision Socratic prompts with zero visual clutter.
+"""
+# ==============================================================================
+# GRADIO INTERFACE LAYOUT (TABBED WINDOWS)
+# ==============================================================================
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🧠 High-Performance 1-Bit Socratic Workspace")
+    with gr.Tabs():
+        # --- TAB 1: INTERACTIVE APP ---
+        with gr.TabItem("Socratic Assistant"):
+            gr.Markdown("### Real-Time Socratic Exploration")
+            gr.Markdown("Real-time token streaming powered by Microsoft's 1.58-bit BitNet GGUF kernel alongside dynamic programmatic output filtering.")
+            with gr.Row():
+                with gr.Column(scale=4):
+                    input_text = gr.Textbox(
+                        label="Concept Prompt",
+                        placeholder="What concept do you wish to dissect? (e.g., What makes something responsibility?)",
+                        lines=2
+                    )
+                    submit_btn = gr.Button("Dissect Concept via Socratic Dialogue", variant="primary")
+                with gr.Column(scale=5):
+                    output_text = gr.Textbox(
+                        label="Cleaned Real-Time Streaming Output",
+                        lines=8,
+                        interactive=False
+                    )
+            submit_btn.click(fn=socratic_streaming_chat, inputs=input_text, outputs=output_text)
+            input_text.submit(fn=socratic_streaming_chat, inputs=input_text, outputs=output_text)
+        # --- TAB 2: TECHNICAL REPORT ---
+        with gr.TabItem("Technical Report"):
+            gr.Markdown(TECHNICAL_REPORT_MD)
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)