Spaces:
Running
Running
| import gradio as gr | |
| import subprocess | |
| import os | |
# Ensure execution context is inside the compiled architecture directory.
# The model path and the inference script used below are relative paths, so
# they resolve against this directory once the switch has happened.
# isdir() rather than exists(): a regular file at this path would pass an
# exists() check and then make os.chdir() raise NotADirectoryError on import.
if os.path.isdir("/content/BitNet"):
    os.chdir("/content/BitNet")
| # ============================================================================== | |
| # CONSTANTS & CONFIGURATION | |
| # ============================================================================== | |
# Relative path of the GGUF weights file; handed to the inference script
# through its "-m" option.
MODEL_PATH = "models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf"

# Default persona shown in the editable "System Instruction" box. The last
# sentence asks the model to print the "%^%^%^" sentinel, which the streaming
# filter treats as a stop marker.
DEFAULT_SYSTEM_PROMPT = " ".join(
    (
        "You are a Socratic assistant. Do not answer questions directly.",
        "Instead, respond exclusively with 3 deep, reflective questions.",
        "Then generate %^%^%^",
    )
)
| # ============================================================================== | |
| # STREAMING ENGINE WITH LOOKAHEAD BUFFER | |
| # ============================================================================== | |
def streaming_chat(user_query, system_prompt):
    """Stream a cleaned model reply for ``user_query``.

    Runs ``run_inference.py`` as a subprocess, reads its stdout one character
    at a time, discards the echoed prompt, and yields the reply text
    accumulated so far. Generation is cut off as soon as a stop marker shows
    up in the reply.

    Args:
        user_query: The question typed by the user. ``None``, empty, or
            whitespace-only input yields a single validation message and no
            subprocess is started.
        system_prompt: Instruction text placed on the ``System:`` line of the
            prompt.

    Yields:
        str: The stripped reply so far, always holding back the trailing
        ``LOOKAHEAD_SIZE`` characters so a partially printed stop marker is
        never shown. The final item is the complete stripped reply with the
        stop marker and everything after it removed. If the subprocess never
        echoes the prompt, the final item is whatever raw output was read.
    """
    if not user_query or not user_query.strip():
        yield "Please enter a valid question."
        return

    # Dynamically inject the user's custom system instruction
    formatted_chat_prompt = f"System: {system_prompt}\nUser: {user_query}\nAssistant:"
    cmd = [
        "python3", "run_inference.py",
        "-m", MODEL_PATH,
        "-p", formatted_chat_prompt,
        "-n", "120",
        "-temp", "0.4",
        "-t", "2",  # Optimized for Hugging Face free-tier dual-core CPUs
    ]

    LOOKAHEAD_SIZE = 45
    reply_label = "Assistant:"
    # These are the markers used to slice the text and stop generation
    stop_markers = (
        "%^%^%^",
        "[end of text]",
        "User:",
        reply_label,
    )
    # The echoed prompt ends at its LAST "Assistant:". User-supplied text may
    # contain the label too, so skip as many occurrences as the prompt holds
    # instead of stopping at the first one.
    labels_to_skip = formatted_chat_prompt.count(reply_label)
    labels_seen = 0
    accumulator = ""
    prompt_cleared = False

    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,  # Hide system logs
        text=True,
        encoding="utf-8",
        errors="replace",  # a stray invalid byte must not abort the stream
        bufsize=1,
    )
    try:
        while True:
            char = process.stdout.read(1)
            if not char:
                break
            accumulator += char

            # --- SWALLOW THE ECHOED PROMPT ---
            if not prompt_cleared:
                # Checked after every character, so each label occurrence is
                # seen exactly once, at the moment it completes.
                if accumulator.endswith(reply_label):
                    labels_seen += 1
                    if labels_seen >= labels_to_skip:
                        prompt_cleared = True
                        accumulator = ""
                continue

            # --- THE CLEANING FUNCTION: scan for structural boundaries ---
            marker = next((m for m in stop_markers if m in accumulator), None)
            if marker is not None:
                # Keep only the text before the marker and stop the engine.
                accumulator = accumulator.split(marker)[0]
                process.terminate()
                break

            # Stream text safely outside the trailing boundary window
            if len(accumulator) > LOOKAHEAD_SIZE:
                yield accumulator[:-LOOKAHEAD_SIZE].strip()

        yield accumulator.strip()
    finally:
        # Runs on normal completion, on a stop marker, and when the consumer
        # closes the generator early: stop the child, release the pipe, and
        # reap the process so it does not linger as a zombie.
        if process.poll() is None:
            process.terminate()
        process.stdout.close()
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()
| # ============================================================================== | |
| # TECHNICAL REPORT MARKDOWN TEXT | |
| # ============================================================================== | |
# Static Markdown source for the "Technical Report" tab; it is passed
# unchanged to gr.Markdown(...). This is a plain triple-quoted string (not an
# f-string), so nothing inside it is interpolated at runtime.
# NOTE(review): section 4 quotes the 45-character lookahead and the
# process.terminate() call, and section 3 quotes the default system prompt.
# Those values live in streaming_chat / DEFAULT_SYSTEM_PROMPT and must be
# updated here by hand if they change.
| TECHNICAL_REPORT_MD = """ | |
| ## 📋 Technical Report: 1-Bit LLM Socratic Refinement Pipeline | |
| **Architecture Core:** Ternary Quantized (1.58-bit) Matrix Processing | |
| --- | |
| ### 1. Executive Objective & Target Dataset | |
| The goal of this initiative was to engineer a hyper-lightweight, lightning-fast edge computing application capable of engaging users in conversational Socratic exploration. Traditional full-precision models require significant memory overhead to hold nuanced philosophical frameworks. This project focused on building an ultra-compressed conversational experience capable of executing inside a constrained local CPU footprint (e.g., standard consumer laptops or free cloud application tiers). | |
| * **Target Fine-Tuning Dataset:** `sanjaypantdsd/socratic-method-conversations` | |
| * **Data Characteristics:** High-quality, clean input-to-output mappings that translate explicit factual questions or structural concepts directly into clusters of exactly three open-ended, deeply analytical questions. | |
| --- | |
| ### 2. Model Training Matrix & Evaluation Phase | |
| Our initial strategy focused on fine-tuning custom models directly on our targeted Socratic dataset. The results exposed clear engineering trade-offs: | |
| | Model Identifier | Architecture Configuration | Operational Performance | Qualitative Evaluation | | |
| | :--- | :--- | :--- | :--- | | |
| | **st192011/bitnet-socratic-1.58b** | Full-precision parameter adjustments tailored to target dataset. | **Excellent** | Produced highly coherent Socratic question arrays aligning perfectly with training structures. | | |
| | **st192011/socratic-bitnet-2b** | Quantized Ternary Representation Variant of custom weights. | **Critically Poor** | Suffered extreme degradation. The model experienced severe structural collapse, outputting infinite semantic loops or unreadable token gibberish. | | |
| #### Analysis of Quantization Collapse | |
| The stark failure of `st192011/socratic-bitnet-2b` highlights a common hurdle in customized 1-bit AI development. When a model's weights are aggressively compressed down to simple ternary values (-1, 0, 1), the mathematical boundaries become extremely rigid. Standard quantization tools often distort the delicate behavioral traits introduced during fine-tuning. | |
| --- | |
| ### 3. Strategy Pivot: Pretrained Weights + Structural Prompt Anchoring | |
| To avoid the quantization bugs of custom fine-tuned weights, we pivoted to a hybrid solution: **combining the official pretrained base weights from Microsoft with precision prompt engineering.** | |
| We deployed `microsoft/bitnet-b1.58-2B-4T-gguf`. While this preserved its foundational knowledge base, it introduced a new challenge: **Base models do not natively know when to stop generating.** | |
| #### The Stop-Token Anchor Hack | |
| To enforce structure, we modified the System Prompt to force the model to declare its own stopping point: | |
| > *"You are a Socratic assistant... Respond exclusively with 3 deep, reflective questions. Then generate %^%^%^"* | |
| This instruction forces the text-prediction engine to anchor itself on a predictable phrase. While the model still experiences trailing hallucinations, it prints a recognizable marker *immediately after* providing the high-quality questions. | |
| --- | |
| ### 4. Production Pipeline Architecture | |
| To deliver a flawless UX, we implemented a **Programmatic UX Stream Filter**: | |
| * **The Lookahead Buffer Zone:** The streaming engine retains the trailing 45 characters inside a private memory array, evaluating it for known stop-sequences before releasing clean text to the UI. | |
| * **Process Resource Reclamation:** The moment a marker is tripped, a background system command kills the active process (`process.terminate()`). | |
| """ | |
| # ============================================================================== | |
| # GRADIO INTERFACE LAYOUT (TABBED WINDOWS) | |
| # ============================================================================== | |
# Two-tab Gradio app: an interactive prompting sandbox and a static report.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 High-Performance 1-Bit AI Sandbox")

    with gr.Tabs():
        # --- TAB 1: INTERACTIVE APP ---
        with gr.TabItem("Experimental Interface"):
            gr.Markdown("### Real-Time 1.58-bit Prompting Sandbox")
            gr.Markdown(
                "Test the limits of Microsoft's BitNet GGUF kernel. "
                "Change the persona, modify the rules, and see how the "
                "Python cleaning function reacts in real-time."
            )

            # Top row: explanation of the stop-token trick next to the
            # editable system prompt.
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown('### 🛠️ The "Stop Token" Hack')
                    gr.Markdown(
                        "**Base models don't know how to stop talking!**\n\n"
                        "To prevent infinite loops, our system prompt instructs the model to literally type the words `%^%^%^` when it is finished. "
                        "Our Python backend uses a **Lookahead Buffer** to watch for those words. If it sees them, it instantly slices them out and kills the engine.\n\n"
                        "*🧪 Try deleting the words `'Then generate %^%^%^'` from the prompt below and see what happens!*"
                    )
                with gr.Column(scale=2):
                    system_prompt_input = gr.Textbox(
                        label="System Instruction (Editable)",
                        value=DEFAULT_SYSTEM_PROMPT,
                        lines=3,
                    )

            gr.Markdown("---")

            # Bottom row: query box and button on the left, streamed reply
            # on the right.
            with gr.Row():
                with gr.Column(scale=4):
                    input_text = gr.Textbox(
                        label="User Query",
                        placeholder="e.g., What makes something responsibility?",
                        lines=2,
                    )
                    submit_btn = gr.Button("Generate Response", variant="primary")
                with gr.Column(scale=5):
                    output_text = gr.Textbox(
                        label="Cleaned Real-Time Streaming Output",
                        lines=8,
                        interactive=False,
                    )

            # The button click and the textbox submit event run the same
            # handler with the same inputs (query + system prompt).
            _chat_wiring = dict(
                fn=streaming_chat,
                inputs=[input_text, system_prompt_input],
                outputs=output_text,
            )
            submit_btn.click(**_chat_wiring)
            input_text.submit(**_chat_wiring)

        # --- TAB 2: TECHNICAL REPORT ---
        with gr.TabItem("Technical Report"):
            gr.Markdown(TECHNICAL_REPORT_MD)

if __name__ == "__main__":
    # Listen on all interfaces, port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)