"""Streamlit front-end for BitNet.cpp 1-bit LLM inference on CPU-only hosts.

Delegates engine setup, model download, and the inference subprocess to
``model_manager.BitNetManager``.  Everything that must survive Streamlit's
script reruns (chat history, engine-ready flag, the initialized manager)
lives in ``st.session_state``.
"""
import streamlit as st
import psutil

from model_manager import BitNetManager

st.set_page_config(page_title="BitNet CPU Assistant", page_icon="🧠", layout="wide")

# NOTE(review): this markdown call carries no content — it looks like a CSS
# block that was stripped out.  Kept as a placeholder for custom styling.
st.markdown(""" """, unsafe_allow_html=True)

st.title("🧠 BitNet CPU Assistant")
st.caption("Blazingly fast 1-bit LLM Inference on CPU-only Environments")

# Chat history persists across reruns via session_state.
if "messages" not in st.session_state:
    st.session_state.messages = []

# Sidebar for controls and monitoring
with st.sidebar:
    st.header("⚙️ Settings")

    # Corrected IDs from the official setup_env.py usage
    model_options = {
        "1bitLLM/bitnet_b1_58-3B": "3B Optimized (Recommended)",
        "1bitLLM/bitnet_b1_58-large": "Large (Efficient)",
        "microsoft/BitNet-b1.58-2B-4T": "2B Specialized",
    }
    model_id = st.selectbox(
        "Select Model",
        options=list(model_options.keys()),
        format_func=lambda x: model_options[x],
    )

    st.header("📈 System Resources")
    # Snapshot taken once per rerun — not a live monitor.
    cpu_usage = psutil.cpu_percent()
    ram_usage = psutil.virtual_memory().percent
    st.markdown(f"""
CPU Usage: {cpu_usage}%
RAM Usage: {ram_usage}%
""", unsafe_allow_html=True)

    if "engine_ready" not in st.session_state:
        st.session_state.engine_ready = False

    if st.button("🚀 Initialize Engine"):
        manager = BitNetManager()
        if manager.setup_engine(model_id=model_id):
            # Keep the initialized manager so the chat path reuses it instead
            # of constructing a fresh, uninitialized one on every prompt.
            st.session_state.manager = manager
            st.session_state.engine_ready = True
            st.success("BitNet.cpp Ready!")
        else:
            st.error("Engine setup failed. Check logs above.")

# Main Chat Interface
status_placeholder = st.empty()
if not st.session_state.engine_ready:
    status_placeholder.warning("⚠️ Engine not initialized. Please click 'Initialize Engine' in the sidebar to start.")

# Render message history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Chat input
if prompt := st.chat_input("Ask me anything..."):
    if not st.session_state.engine_ready:
        st.error("You must initialize the engine before chatting.")
    else:
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        with st.chat_message("assistant"):
            message_placeholder = st.empty()
            full_response = ""
            # Reuse the manager created at initialization; fall back to a
            # fresh instance only if it is missing from session state.
            manager = st.session_state.get("manager") or BitNetManager()
            model_path = manager.download_model(model_id=model_id, filename="ggml-model-i2_s.gguf")

            if model_path:
                with st.status("🚀 Initializing inference engine...") as status:
                    # Execute real inference
                    process = manager.run_inference(prompt, model_path)
                    if process:
                        status.update(label="🧠 Generating response using 1-bit kernels...", state="running")
                        log_buffer = []

                        # Create an expander for engine logs
                        with st.expander("🛠️ Engine Metadata & Logs", expanded=False):
                            log_placeholder = st.empty()

                        # llama.cpp engine chatter that must stay out of the
                        # chat bubble (hoisted out of the streaming loop).
                        metadata_prefixes = (
                            "llama", "main", "llm_", "llm-", "system_info",
                            "sampler", "generate", "common", "BOS", "EOS",
                            "UNK", "PAD",
                        )

                        # Stream the output tokens
                        for line in process.stdout:
                            # Filter out engine metadata from the chat bubble
                            is_metadata = (
                                any(line.strip().startswith(p) for p in metadata_prefixes)
                                or "..." in line
                                or "init:" in line
                            )
                            if is_metadata:
                                log_buffer.append(line)
                                log_placeholder.code("".join(log_buffer[-10:]))
                            else:
                                full_response += line
                                message_placeholder.markdown(full_response + "▌")

                        status.update(label="✅ Generation Complete", state="complete")

                        # Check for errors in stderr but don't clutter the chat
                        stderr = process.stderr.read()
                        if stderr:
                            log_buffer.append(f"\nSTDERR: {stderr}")
                            log_placeholder.code("".join(log_buffer[-15:]))

                        # Reap the finished subprocess so it doesn't linger
                        # as a zombie after its streams are drained.
                        process.wait()
                    else:
                        status.update(label="❌ Failed to launch engine", state="error")
                        full_response = "Failed to launch inference engine."

                message_placeholder.markdown(full_response)
            else:
                st.error("Model not available.")

        # Record the assistant turn only when there is actual content;
        # previously an empty message was appended when the model was missing.
        if full_response:
            st.session_state.messages.append({"role": "assistant", "content": full_response})