st192011 committed on
Commit
1fc5cde
·
verified ·
1 Parent(s): 6f7a25d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +147 -39
app.py CHANGED
@@ -2,6 +2,13 @@ import gradio as gr
2
  import subprocess
3
  import os
4
 
 
 
 
 
 
 
 
5
  SYSTEM_INSTRUCTION = (
6
  "You are a Socratic assistant. Do not answer questions directly. "
7
  "Instead, respond exclusively with 3 deep, reflective questions. "
@@ -9,60 +16,161 @@ SYSTEM_INSTRUCTION = (
9
  )
10
  MODEL_PATH = "models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf"
11
 
12
- def clean_socratic_output(raw_stdout, full_prompt):
13
- generated_text = raw_stdout.replace(full_prompt, "").strip()
14
- stop_markers = ["Stop token", "stop token", "Stop.", "stop.", "Response:", "Response"]
15
-
16
- lowest_index = len(generated_text)
17
- marker_found = False
18
- for marker in stop_markers:
19
- idx = generated_text.find(marker)
20
- if idx != -1 and idx < lowest_index:
21
- lowest_index = idx
22
- marker_found = True
23
-
24
- if marker_found:
25
- generated_text = generated_text[:lowest_index].strip()
26
- if generated_text.endswith(("\n4.", "\n4", "4.", "4")):
27
- generated_text = generated_text.rsplit("\n4", 1)[0].strip()
28
- return generated_text
29
-
30
- def socratic_chat(user_query):
31
  if not user_query.strip():
32
- return "Please enter a valid question."
33
-
 
34
  formatted_chat_prompt = f"System: {SYSTEM_INSTRUCTION}\nUser: {user_query}\nAssistant:"
35
 
36
  cmd = [
37
  "python3", "run_inference.py",
38
  "-m", MODEL_PATH,
39
  "-p", formatted_chat_prompt,
40
- "-n", "100",
41
  "-temp", "0.4",
42
- "-t", "2" # Matches HF Space 2-vCPU hardware allocation
43
  ]
44
 
45
- execution_result = subprocess.run(cmd, capture_output=True, text=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
- if execution_result.stdout:
48
- return clean_socratic_output(execution_result.stdout, formatted_chat_prompt)
49
- else:
50
- return f"Error running model: {execution_result.stderr}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
- # Build the Gradio UI Layout
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
54
- gr.Markdown("# 🧠 1-Bit Local Socratic Assistant")
55
- gr.Markdown("This interface is running an official Microsoft BitNet 1.58-bit model on an optimized CPU backend.")
56
 
57
- with gr.Row():
58
- with gr.Column():
59
- input_text = gr.Textbox(label="What concept are you exploring?", placeholder="e.g., What is justice?")
60
- submit_btn = gr.Button("Generate Socratic Reflection", variant="primary")
61
- with gr.Column():
62
- output_text = gr.Textbox(label="Socratic Feedback", lines=6)
63
 
64
- submit_btn.click(fn=socratic_chat, inputs=input_text, outputs=output_text)
65
- input_text.submit(fn=socratic_chat, inputs=input_text, outputs=output_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  if __name__ == "__main__":
68
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
2
  import subprocess
3
  import os
4
 
5
# Ensure execution context is inside the compiled architecture directory.
# The inference script and the model file are addressed by relative path, so
# the process must run from the BitNet checkout whenever one is available.
# BITNET_DIR overrides the default Colab location; when the directory is
# absent the current working directory is left untouched.
_BITNET_DIR = os.environ.get("BITNET_DIR", "/content/BitNet")
# isdir (not exists): a regular file at that path would make os.chdir raise.
if os.path.isdir(_BITNET_DIR):
    os.chdir(_BITNET_DIR)
8
+
9
+ # ==============================================================================
10
+ # CONSTANTS & CONFIGURATION
11
+ # ==============================================================================
12
  SYSTEM_INSTRUCTION = (
13
  "You are a Socratic assistant. Do not answer questions directly. "
14
  "Instead, respond exclusively with 3 deep, reflective questions. "
 
16
  )
17
  MODEL_PATH = "models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf"
18
 
19
+ # ==============================================================================
20
+ # STREAMING ENGINE WITH LOOKAHEAD BUFFER
21
+ # ==============================================================================
22
# Phrases the base model tends to print once its questions are finished;
# generation is cut at the first one that appears.
_STOP_MARKERS = (
    "Stop token", "stop token",
    "Stop.", "stop.",
    "Response:", "Response",
    "Assistant:",
)
# Number of trailing characters withheld from the UI so that a half-printed
# stop marker is never shown before it can be recognised and removed.
_LOOKAHEAD_SIZE = 45
# How long to wait for the engine to exit after terminate() before kill().
_EXIT_GRACE_SECONDS = 5
# Upper bound on how much engine stderr is surfaced in an error message.
_STDERR_TAIL_CHARS = 2000


def _first_stop_index(text, markers=_STOP_MARKERS):
    """Return the lowest index at which any marker starts in *text*, or -1."""
    hits = [pos for pos in map(text.find, markers) if pos != -1]
    return min(hits, default=-1)


def _reap_process(process):
    """Stop *process* if it is still running, close its stdout and wait for it."""
    if process.poll() is None:
        process.terminate()
    # Closing our read end makes any process still writing to the pipe fail
    # with a broken pipe instead of generating into the void.
    if process.stdout is not None:
        process.stdout.close()
    try:
        process.wait(timeout=_EXIT_GRACE_SECONDS)
    except subprocess.TimeoutExpired:
        process.kill()
        process.wait()


def socratic_streaming_chat(user_query):
    """Stream the model's Socratic questions for *user_query*.

    This is a generator: every yielded value is the full text to display so
    far (not a delta), which is what a streaming Gradio output expects.

    The function runs ``run_inference.py`` as a subprocess, reads its stdout
    one character at a time, drops the echoed prompt, cuts the text at the
    first stop marker and terminates the engine as soon as that happens.
    The last ``_LOOKAHEAD_SIZE`` characters are held back while streaming and
    released only in the final yield.

    Args:
        user_query: The concept or question typed by the user.

    Yields:
        str: A validation message for empty input, progressively longer
        cleaned output while the engine runs, the final cleaned output, or
        ``"Error running model: ..."`` when the engine produced no text.
    """
    import tempfile  # local import: only needed while an engine run is active

    # `not user_query` also covers None, which has no .strip().
    if not user_query or not user_query.strip():
        yield "Please enter a valid question."
        return

    formatted_chat_prompt = f"System: {SYSTEM_INSTRUCTION}\nUser: {user_query}\nAssistant:"

    cmd = [
        "python3", "run_inference.py",
        "-m", MODEL_PATH,
        "-p", formatted_chat_prompt,
        "-n", "120",
        "-temp", "0.4",
        "-t", "2",  # Optimized for Hugging Face free-tier dual-core CPUs
    ]

    generated = ""         # text that is confirmed not to be prompt echo
    echo_len = 0           # leading prompt characters matched by the held-back tail
    echo_consumed = False  # True once the complete prompt echo has been dropped
    stopped = False        # True once a stop marker ended the run
    shown = ""             # last value yielded, to avoid redundant UI updates

    # stderr goes to a temp file rather than into stdout: diagnostics must not
    # be shown as model output, and an unread PIPE could fill up and block the
    # child. NOTE(review): assumes the engine logs to stderr and prints the
    # prompt echo plus generated text on stdout -- confirm against
    # run_inference.py.
    with tempfile.TemporaryFile() as stderr_log:
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=stderr_log,
            text=True,
            encoding="utf-8",
            errors="replace",  # never abort the stream on a malformed byte
            bufsize=1,
        )
        try:
            # read(1) returns as soon as one character is available, which is
            # what keeps the UI live; "" signals end of stream.
            for char in iter(lambda: process.stdout.read(1), ""):
                if echo_consumed:
                    generated += char
                elif char == formatted_chat_prompt[echo_len]:
                    # Possibly inside the prompt echo: hold the character back
                    # so stop markers that occur in the prompt itself (it ends
                    # with "Assistant:") cannot end the run prematurely.
                    echo_len += 1
                    if echo_len == len(formatted_chat_prompt):
                        # Whole prompt echoed: discard it and anything before it.
                        echo_consumed = True
                        generated = ""
                        if shown:
                            shown = ""
                            yield shown
                    continue
                else:
                    # Not an echo after all: the held-back characters were
                    # ordinary output. Restarting from zero is a simple
                    # matcher, sufficient as long as the prompt does not begin
                    # with a self-repeating prefix.
                    generated += formatted_chat_prompt[:echo_len]
                    echo_len = 1 if char == formatted_chat_prompt[0] else 0
                    if not echo_len:
                        generated += char

                cut = _first_stop_index(generated)
                if cut != -1:
                    generated = generated[:cut]
                    stopped = True
                    break  # the finally clause terminates the engine

                # UX lookahead: only release text outside the trailing window.
                if len(generated) > _LOOKAHEAD_SIZE:
                    visible = generated[:-_LOOKAHEAD_SIZE].strip()
                    if visible != shown:
                        shown = visible
                        yield visible
        finally:
            # Also runs when the consumer abandons the generator mid-stream,
            # so the engine never outlives the request.
            _reap_process(process)

        # A partially matched echo at end of stream was real output.
        if not echo_consumed and echo_len:
            generated += formatted_chat_prompt[:echo_len]
        if not stopped:
            cut = _first_stop_index(generated)
            if cut != -1:
                generated = generated[:cut]
                stopped = True

        final_text = generated.strip()
        if final_text or stopped:
            # Yield the completely un-buffered, finalized clean string.
            yield final_text
            return

        # Nothing was generated: report what the engine wrote to stderr.
        stderr_log.seek(0)
        detail = stderr_log.read().decode("utf-8", errors="replace").strip()
        yield "Error running model: " + (
            detail[-_STDERR_TAIL_CHARS:] or "no output was produced"
        )
89
+
90
+ # ==============================================================================
91
+ # TECHNICAL REPORT MARKDOWN TEXT
92
+ # ==============================================================================
93
+ TECHNICAL_REPORT_MD = """
94
+ ## 📋 Project Technical Report: 1-Bit LLM Socratic Refinement Pipeline
95
+ **Architecture Core:** Ternary Quantized (1.58-bit) Matrix Processing
96
+
97
+ ---
98
 
99
+ ### 1. Executive Objective & Target Dataset
100
+ The goal of this initiative was to engineer a hyper-lightweight, lightning-fast edge computing application capable of engaging users in conversational Socratic exploration. Traditional full-precision models require significant memory overhead to hold nuanced philosophical frameworks. This project focused on building an ultra-compressed conversational experience capable of executing inside a constrained local CPU footprint (e.g., standard consumer laptops or free cloud application tiers).
101
+
102
+ * **Target Fine-Tuning Dataset:** `sanjaypantdsd/socratic-method-conversations`
103
+ * **Data Characteristics:** High-quality, clean input-to-output mappings that translate explicit factual questions or structural concepts directly into clusters of exactly three open-ended, deeply analytical questions.
104
+
105
+ ---
106
+
107
+ ### 2. Model Training Matrix & Evaluation Phase
108
+ Our initial strategy focused on fine-tuning custom models directly on our targeted Socratic dataset. The results exposed clear engineering trade-offs between unquantized fine-tuning weight adjustments and customized binary compilation layers:
109
+
110
+ | Model Identifier | Architecture Configuration | Operational Performance | Qualitative Evaluation |
111
+ | :--- | :--- | :--- | :--- |
112
+ | **st192011/bitnet-socratic-1.58b** | Full-precision parameter adjustments tailored to target dataset. | **Excellent** | Produced highly coherent Socratic question arrays aligning perfectly with training structures. |
113
+ | **st192011/socratic-bitnet-2b** | Quantized Ternary Representation Variant of custom weights. | **Critically Poor** | Suffered extreme degradation. The model experienced severe structural collapse, outputting infinite semantic loops or unreadable token gibberish. |
114
+
115
+ #### Analysis of Quantization Collapse
116
+ The stark failure of `st192011/socratic-bitnet-2b` highlights a common hurdle in customized 1-bit AI development. When a model's weights are aggressively compressed down to simple ternary values (-1, 0, 1), the mathematical boundaries become extremely rigid. Standard quantization tools often distort the delicate behavioral traits introduced during fine-tuning, leading to a complete breakdown of language modeling capabilities.
117
+
118
+ ---
119
+
120
+ ### 3. Strategy Pivot: Pretrained Weights + Structural Prompt Anchoring
121
+ To avoid the quantization bugs of custom fine-tuned weights, we pivoted to an elegant hybrid solution: **combining the official, verified pretrained base weights from Microsoft with precision prompt engineering.**
122
+
123
+ We deployed the official `microsoft/bitnet-b1.58-2B-4T-gguf` base model. While this preserved its deep, foundational knowledge base, it introduced a new challenge: **Base models do not natively know when to stop generating.**
124
+
125
+ #### The Stop-Token Anchor Hack
126
+ To enforce structure without re-training the model, we modified the `SYSTEM_INSTRUCTION` block to force the model to declare its own stopping point:
127
+ > *"You are a Socratic assistant... Respond exclusively with 3 deep, reflective questions. Then generate stop token"*
128
+
129
+ This instruction forces the model's text-prediction engine to anchor itself on a predictable phrase as soon as its linguistic objective is met. Our test iterations confirmed that while the model still experiences trailing token hallucinations (e.g., repeating `Stop. Stop. Stop. Response: 1.`), it prints a recognizable marker *immediately after* providing the high-quality questions.
130
+
131
+ ---
132
+
133
+ ### 4. Production Pipeline Architecture
134
+ To deliver a flawless user experience, we implemented a **Programmatic UX Stream Filter** running on the host system. This layer completely isolates the user from any underlying engine instability:
135
+
136
+ * **The Lookahead Buffer Zone:** The streaming engine retains the trailing 45 characters of generation inside a private memory array, evaluating it for known stop-sequences before releasing clean text to the UI.
137
+ * **Process Resource Reclamation:** The moment a marker is tripped, a background system command kills the active process (`process.terminate()`). This prevents the model from wasting CPU cycles on hallucinated loops, maximizing host performance.
138
+ * **Flawless Formatting Output:** The final user interface performs at near real-time speeds on commodity hardware, delivering clean, high-precision Socratic prompts with zero visual clutter.
139
+ """
140
+
141
# ==============================================================================
# GRADIO INTERFACE LAYOUT (TABBED WINDOWS)
# ==============================================================================
# Two tabs: the interactive assistant (prompt and button on the left, streamed
# output on the right) and a static page rendering the technical report.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 High-Performance 1-Bit Socratic Workspace")

    with gr.Tabs():
        # --- TAB 1: INTERACTIVE APP ---
        with gr.TabItem("Socratic Assistant"):
            gr.Markdown("### Real-Time Socratic Exploration")
            gr.Markdown(
                "Real-time token streaming powered by Microsoft's 1.58-bit BitNet GGUF kernel "
                "alongside dynamic programmatic output filtering."
            )

            with gr.Row():
                with gr.Column(scale=4):
                    input_text = gr.Textbox(
                        label="Concept Prompt",
                        placeholder=(
                            "What concept do you wish to dissect? "
                            "(e.g., What makes something responsibility?)"
                        ),
                        lines=2,
                    )
                    submit_btn = gr.Button(
                        "Dissect Concept via Socratic Dialogue",
                        variant="primary",
                    )
                with gr.Column(scale=5):
                    output_text = gr.Textbox(
                        label="Cleaned Real-Time Streaming Output",
                        lines=8,
                        interactive=False,
                    )

            # Clicking the button and pressing Enter in the prompt box both
            # run the same streaming handler into the same output box.
            _stream_binding = dict(
                fn=socratic_streaming_chat,
                inputs=input_text,
                outputs=output_text,
            )
            submit_btn.click(**_stream_binding)
            input_text.submit(**_stream_binding)

        # --- TAB 2: TECHNICAL REPORT ---
        with gr.TabItem("Technical Report"):
            gr.Markdown(TECHNICAL_REPORT_MD)

# Listen on all interfaces at port 7860 when executed as a script.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)