# Hugging Face upload metadata (commit 6b3d51d, "Upload 5 files" by Vishwas1)
import streamlit as st
import psutil
from model_manager import BitNetManager
# Basic page chrome: wide layout, brain icon in the browser tab.
st.set_page_config(page_title="BitNet CPU Assistant", page_icon="🧠", layout="wide")
# Dark GitHub-style theme plus a reusable "status-card" container used by the
# sidebar resource metrics; unsafe_allow_html is required to inject raw CSS.
st.markdown("""
<style>
.stApp { background-color: #0d1117; color: #c9d1d9; }
.status-card {
background: rgba(30, 41, 59, 0.5);
border: 1px solid #30363d;
border-radius: 10px;
padding: 15px;
margin-bottom: 10px;
}
.metric-value { color: #58a6ff; font-weight: bold; }
h1, h2, h3 { color: #58a6ff; }
</style>
""", unsafe_allow_html=True)
# Page heading and subtitle.
st.title("🧠 BitNet CPU Assistant")
st.caption("Blazingly fast 1-bit LLM Inference on CPU-only Environments")
# Persist chat history across Streamlit reruns (each interaction re-executes
# the whole script, so state must live in st.session_state).
if "messages" not in st.session_state:
    st.session_state.messages = []
# Sidebar for controls and monitoring
with st.sidebar:
    st.header("βš™οΈ Settings")
    # Corrected IDs from the official setup_env.py usage.
    # Maps repo id -> human-readable label shown in the selectbox.
    model_options = {
        "1bitLLM/bitnet_b1_58-3B": "3B Optimized (Recommended)",
        "1bitLLM/bitnet_b1_58-large": "Large (Efficient)",
        "microsoft/BitNet-b1.58-2B-4T": "2B Specialized"
    }
    model_id = st.selectbox("Select Model", options=list(model_options.keys()), format_func=lambda x: model_options[x])
    st.header("πŸ“ˆ System Resources")
    # NOTE(review): cpu_percent() without an interval reports usage since the
    # previous call; the very first call of a session may return 0.0.
    cpu_usage = psutil.cpu_percent()
    ram_usage = psutil.virtual_memory().percent
    st.markdown(f"""
<div class="status-card">
CPU Usage: <span class="metric-value">{cpu_usage}%</span><br>
RAM Usage: <span class="metric-value">{ram_usage}%</span>
</div>
""", unsafe_allow_html=True)
    if "engine_ready" not in st.session_state:
        st.session_state.engine_ready = False
    if st.button("πŸš€ Initialize Engine"):
        manager = BitNetManager()
        if manager.setup_engine(model_id=model_id):
            # Keep the configured manager so later reruns can reuse the
            # initialized engine instead of constructing a fresh instance.
            st.session_state.manager = manager
            st.session_state.engine_ready = True
            st.success("BitNet.cpp Ready!")
        else:
            st.error("Engine setup failed. Check logs above.")
# Main Chat Interface
status_placeholder = st.empty()
if not st.session_state.engine_ready:
    status_placeholder.warning("⚠️ Engine not initialized. Please click 'Initialize Engine' in the sidebar to start.")
# Render message history (session_state survives reruns; widgets do not).
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])
# Chat input: runs one full request/response turn per submitted prompt.
if prompt := st.chat_input("Ask me anything..."):
    if not st.session_state.engine_ready:
        st.error("You must initialize the engine before chatting.")
    else:
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)
        with st.chat_message("assistant"):
            message_placeholder = st.empty()
            full_response = ""
            # Reuse the manager initialized in the sidebar when available;
            # fall back to a fresh instance so this turn still works.
            manager = st.session_state.get("manager") or BitNetManager()
            model_path = manager.download_model(model_id=model_id, filename="ggml-model-i2_s.gguf")
            if model_path:
                with st.status("πŸš€ Initializing inference engine...") as status:
                    # Execute real inference (returns a Popen-like handle).
                    process = manager.run_inference(prompt, model_path)
                    if process:
                        status.update(label="🧠 Generating response using 1-bit kernels...", state="running")
                        log_buffer = []
                        # Engine metadata goes to this expander, not the chat bubble.
                        with st.expander("πŸ› οΈ Engine Metadata & Logs", expanded=False):
                            log_placeholder = st.empty()
                        # Hoisted out of the loop: tested against every stdout line.
                        metadata_prefixes = ("llama", "main", "llm_", "llm-", "system_info", "sampler", "generate", "common", "BOS", "EOS", "UNK", "PAD")
                        # Stream the output tokens
                        for line in process.stdout:
                            # Filter out engine metadata from the chat bubble
                            if any(line.strip().startswith(p) for p in metadata_prefixes) or "..." in line or "init:" in line:
                                log_buffer.append(line)
                                log_placeholder.code("".join(log_buffer[-10:]))
                            else:
                                full_response += line
                                # Trailing block cursor mimics live typing.
                                message_placeholder.markdown(full_response + "β–Œ")
                        status.update(label="βœ… Generation Complete", state="complete")
                        # Check for errors in stderr but don't clutter the chat.
                        # NOTE(review): stderr is read only after stdout is
                        # drained; a very chatty stderr could fill its pipe
                        # buffer and stall the child — confirm engine behavior.
                        stderr = process.stderr.read()
                        if stderr:
                            log_buffer.append(f"\nSTDERR: {stderr}")
                            log_placeholder.code("".join(log_buffer[-15:]))
                        # Reap the child so it does not linger as a zombie.
                        process.wait()
                    else:
                        status.update(label="❌ Failed to launch engine", state="error")
                        full_response = "Failed to launch inference engine."
                # Final render without the streaming cursor.
                message_placeholder.markdown(full_response)
            else:
                st.error("Model not available.")
        st.session_state.messages.append({"role": "assistant", "content": full_response})