"""Gradio front-end for a BitNet b1.58 "Socratic" prompting demo.

The app shells out to ``run_inference.py`` for every request, reads the
engine's stdout one character at a time, drops the echoed prompt, and streams
the remaining text to the UI while holding back a small look-ahead window so
stop markers can be cut off before they are ever displayed.
"""

import os
import subprocess
from typing import Iterator

import gradio as gr

# Ensure execution context is inside the compiled architecture directory.
# MODEL_PATH and run_inference.py below are resolved relative to it.
if os.path.isdir("/content/BitNet"):
    os.chdir("/content/BitNet")

# ==============================================================================
# CONSTANTS & CONFIGURATION
# ==============================================================================
MODEL_PATH = "models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf"

DEFAULT_SYSTEM_PROMPT = (
    "You are a Socratic assistant. Do not answer questions directly. "
    "Instead, respond exclusively with 3 deep, reflective questions. "
    "Then generate %^%^%^"
)

# Generation settings forwarded to run_inference.py.
MAX_NEW_TOKENS = 120
TEMPERATURE = 0.4
CPU_THREADS = 2  # Optimized for Hugging Face free-tier dual-core CPUs

# Tag that terminates the formatted prompt; the engine's echo of the prompt
# is discarded up to (and including) this tag.
ASSISTANT_TAG = "Assistant:"

# Number of trailing characters withheld from the UI while streaming.
LOOKAHEAD_SIZE = 45

# Markers that end the visible answer; checked in this order.
STOP_MARKERS = ("%^%^%^", "[end of text]", "User:", "Assistant:")

# Seconds to wait for the engine to exit before escalating to kill().
SHUTDOWN_TIMEOUT_S = 5


# ==============================================================================
# STREAMING ENGINE WITH LOOKAHEAD BUFFER
# ==============================================================================
def _stop_process(process: subprocess.Popen) -> None:
    """Make sure *process* is dead, reaped, and its stdout pipe is closed."""
    try:
        if process.poll() is None:
            process.terminate()  # polite SIGTERM first
            try:
                process.wait(timeout=SHUTDOWN_TIMEOUT_S)
            except subprocess.TimeoutExpired:
                process.kill()  # engine ignored SIGTERM: force it
                process.wait()
    finally:
        if process.stdout is not None:
            process.stdout.close()


def streaming_chat(user_query, system_prompt) -> Iterator[str]:
    """Run one inference request and stream the cleaned answer.

    Args:
        user_query: Text typed by the user. Empty/blank input short-circuits
            with a validation message.
        system_prompt: Instruction text injected in front of the user query.

    Yields:
        The answer accumulated so far (whitespace-stripped). While streaming,
        the trailing ``LOOKAHEAD_SIZE`` characters are withheld; the last
        value yielded is the complete text with any stop marker removed. If
        the engine cannot be started, or exits with a non-zero code without
        producing text, a short error message is yielded instead.
    """
    if not user_query or not user_query.strip():
        yield "Please enter a valid question."
        return

    system_prompt = system_prompt or ""

    # Dynamically inject the user's custom system instruction
    formatted_chat_prompt = (
        f"System: {system_prompt}\nUser: {user_query}\nAssistant:"
    )

    cmd = [
        "python3", "run_inference.py",
        "-m", MODEL_PATH,
        "-p", formatted_chat_prompt,
        "-n", str(MAX_NEW_TOKENS),
        "-temp", str(TEMPERATURE),
        "-t", str(CPU_THREADS),
    ]

    try:
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,  # Hide system logs
            text=True,
            encoding="utf-8",  # do not depend on the host locale
            errors="replace",  # never crash the stream on a bad byte
            bufsize=1,
        )
    except OSError as exc:
        yield f"Could not start the inference engine: {exc}"
        return

    # The echoed prompt ends with ASSISTANT_TAG, but user-supplied text may
    # contain the tag as well. Skip exactly as many occurrences as the prompt
    # holds so the real terminator is not mistaken for a stop marker.
    tags_to_skip = formatted_chat_prompt.count(ASSISTANT_TAG)
    tags_seen = 0

    accumulator = ""
    prompt_cleared = False
    stop_triggered = False

    try:
        # Two-argument iter(): read single characters until EOF ("").
        for char in iter(lambda: process.stdout.read(1), ""):
            accumulator += char

            # --- SWALLOW THE ECHOED PROMPT ---
            if not prompt_cleared:
                if accumulator.endswith(ASSISTANT_TAG):
                    tags_seen += 1
                    if tags_seen >= tags_to_skip:
                        prompt_cleared = True
                        accumulator = ""
                continue

            # --- THE CLEANING FUNCTION: Scan for structural boundaries ---
            # Input arrives one character at a time, so a marker can only
            # ever complete at the very end of the buffer.
            marker = next(
                (m for m in STOP_MARKERS if accumulator.endswith(m)), None
            )
            if marker is not None:
                accumulator = accumulator[:-len(marker)]
                stop_triggered = True
                break

            # Stream text safely outside the trailing boundary window
            if len(accumulator) > LOOKAHEAD_SIZE:
                yield accumulator[:-LOOKAHEAD_SIZE].strip()

        if not stop_triggered:
            # EOF: give the engine a moment to exit on its own so that
            # returncode reflects its real status rather than our signal.
            try:
                process.wait(timeout=SHUTDOWN_TIMEOUT_S)
            except subprocess.TimeoutExpired:
                pass
    finally:
        # Also runs when the consumer closes the generator early (e.g. the
        # client disconnects), so the engine never outlives the request.
        _stop_process(process)

    final_text = accumulator.strip()
    if (
        not final_text
        and not stop_triggered
        and process.returncode not in (0, None)
    ):
        # stderr is discarded, so surface at least the exit status.
        yield (
            "The inference engine exited with code "
            f"{process.returncode} before producing any output."
        )
        return

    yield final_text


# ==============================================================================
# TECHNICAL REPORT MARKDOWN TEXT
# ==============================================================================
TECHNICAL_REPORT_MD = """
## 📋 Technical Report: 1-Bit LLM Socratic Refinement Pipeline
**Architecture Core:** Ternary Quantized (1.58-bit) Matrix Processing

---

### 1. Executive Objective & Target Dataset
The goal of this initiative was to engineer a hyper-lightweight, lightning-fast edge computing application capable of engaging users in conversational Socratic exploration. Traditional full-precision models require significant memory overhead to hold nuanced philosophical frameworks. This project focused on building an ultra-compressed conversational experience capable of executing inside a constrained local CPU footprint (e.g., standard consumer laptops or free cloud application tiers).

* **Target Fine-Tuning Dataset:** `sanjaypantdsd/socratic-method-conversations`
* **Data Characteristics:** High-quality, clean input-to-output mappings that translate explicit factual questions or structural concepts directly into clusters of exactly three open-ended, deeply analytical questions.

---

### 2. Model Training Matrix & Evaluation Phase
Our initial strategy focused on fine-tuning custom models directly on our targeted Socratic dataset. The results exposed clear engineering trade-offs:

| Model Identifier | Architecture Configuration | Operational Performance | Qualitative Evaluation |
| :--- | :--- | :--- | :--- |
| **st192011/bitnet-socratic-1.58b** | Full-precision parameter adjustments tailored to target dataset. | **Excellent** | Produced highly coherent Socratic question arrays aligning perfectly with training structures. |
| **st192011/socratic-bitnet-2b** | Quantized Ternary Representation Variant of custom weights. | **Critically Poor** | Suffered extreme degradation. The model experienced severe structural collapse, outputting infinite semantic loops or unreadable token gibberish. |

#### Analysis of Quantization Collapse
The stark failure of `st192011/socratic-bitnet-2b` highlights a common hurdle in customized 1-bit AI development. When a model's weights are aggressively compressed down to simple ternary values (-1, 0, 1), the mathematical boundaries become extremely rigid. Standard quantization tools often distort the delicate behavioral traits introduced during fine-tuning.

---

### 3. Strategy Pivot: Pretrained Weights + Structural Prompt Anchoring
To avoid the quantization bugs of custom fine-tuned weights, we pivoted to a hybrid solution: **combining the official pretrained base weights from Microsoft with precision prompt engineering.**

We deployed `microsoft/bitnet-b1.58-2B-4T-gguf`. While this preserved its foundational knowledge base, it introduced a new challenge: **Base models do not natively know when to stop generating.**

#### The Stop-Token Anchor Hack
To enforce structure, we modified the System Prompt to force the model to declare its own stopping point:

> *"You are a Socratic assistant... Respond exclusively with 3 deep, reflective questions. Then generate %^%^%^"*

This instruction forces the text-prediction engine to anchor itself on a predictable phrase. While the model still experiences trailing hallucinations, it prints a recognizable marker *immediately after* providing the high-quality questions.

---

### 4. Production Pipeline Architecture
To deliver a flawless UX, we implemented a **Programmatic UX Stream Filter**:

* **The Lookahead Buffer Zone:** The streaming engine retains the trailing 45 characters inside a private memory array, evaluating it for known stop-sequences before releasing clean text to the UI.
* **Process Resource Reclamation:** The moment a marker is tripped, a background system command kills the active process (`process.terminate()`).
"""

# ==============================================================================
# GRADIO INTERFACE LAYOUT (TABBED WINDOWS)
# ==============================================================================
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 High-Performance 1-Bit AI Sandbox")

    with gr.Tabs():
        # --- TAB 1: INTERACTIVE APP ---
        with gr.TabItem("Experimental Interface"):
            gr.Markdown("### Real-Time 1.58-bit Prompting Sandbox")
            gr.Markdown(
                "Test the limits of Microsoft's BitNet GGUF kernel. Change the persona, modify the rules, and see how the Python cleaning function reacts in real-time."
            )

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 🛠️ The \"Stop Token\" Hack")
                    gr.Markdown(
                        "**Base models don't know how to stop talking!**\n\n"
                        "To prevent infinite loops, our system prompt instructs the model to literally type the words `%^%^%^` when it is finished. "
                        "Our Python backend uses a **Lookahead Buffer** to watch for those words. If it sees them, it instantly slices them out and kills the engine.\n\n"
                        "*🧪 Try deleting the words `'Then generate %^%^%^'` from the prompt below and see what happens!*"
                    )
                with gr.Column(scale=2):
                    system_prompt_input = gr.Textbox(
                        label="System Instruction (Editable)",
                        value=DEFAULT_SYSTEM_PROMPT,
                        lines=3,
                    )

            gr.Markdown("---")

            with gr.Row():
                with gr.Column(scale=4):
                    input_text = gr.Textbox(
                        label="User Query",
                        placeholder="e.g., What makes something responsibility?",
                        lines=2,
                    )
                    submit_btn = gr.Button(
                        "Generate Response", variant="primary"
                    )
                with gr.Column(scale=5):
                    output_text = gr.Textbox(
                        label="Cleaned Real-Time Streaming Output",
                        lines=8,
                        interactive=False,
                    )

            # Wire up the inputs to include the system prompt; the button and
            # the Enter key in the query box trigger the same handler.
            chat_inputs = [input_text, system_prompt_input]
            submit_btn.click(
                fn=streaming_chat, inputs=chat_inputs, outputs=output_text
            )
            input_text.submit(
                fn=streaming_chat, inputs=chat_inputs, outputs=output_text
            )

        # --- TAB 2: TECHNICAL REPORT ---
        with gr.TabItem("Technical Report"):
            gr.Markdown(TECHNICAL_REPORT_MD)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)