# Hugging Face Spaces page metadata (scrape residue, kept as comments):
#   Spaces: Running
#   File size: 5,340 Bytes
2544845 804d64f 2544845 804d64f 203a7b1 b4c8d46 203a7b1 804d64f 203a7b1 2544845 203a7b1 2544845 203a7b1 2544845 203a7b1 804d64f 2544845 804d64f 2544845 6b3d51d 804d64f 6b3d51d 2544845 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 | import streamlit as st
import psutil
from model_manager import BitNetManager
st.set_page_config(page_title="BitNet CPU Assistant", page_icon="๐ง ", layout="wide")
st.markdown("""
<style>
.stApp { background-color: #0d1117; color: #c9d1d9; }
.status-card {
background: rgba(30, 41, 59, 0.5);
border: 1px solid #30363d;
border-radius: 10px;
padding: 15px;
margin-bottom: 10px;
}
.metric-value { color: #58a6ff; font-weight: bold; }
h1, h2, h3 { color: #58a6ff; }
</style>
""", unsafe_allow_html=True)
st.title("๐ง BitNet CPU Assistant")
st.caption("Blazingly fast 1-bit LLM Inference on CPU-only Environments")
if "messages" not in st.session_state:
st.session_state.messages = []
# Sidebar for controls and monitoring
with st.sidebar:
st.header("โ๏ธ Settings")
# Corrected IDs from the official setup_env.py usage
model_options = {
"1bitLLM/bitnet_b1_58-3B": "3B Optimized (Recommended)",
"1bitLLM/bitnet_b1_58-large": "Large (Efficient)",
"microsoft/BitNet-b1.58-2B-4T": "2B Specialized"
}
model_id = st.selectbox("Select Model", options=list(model_options.keys()), format_func=lambda x: model_options[x])
st.header("๐ System Resources")
cpu_usage = psutil.cpu_percent()
ram_usage = psutil.virtual_memory().percent
st.markdown(f"""
<div class="status-card">
CPU Usage: <span class="metric-value">{cpu_usage}%</span><br>
RAM Usage: <span class="metric-value">{ram_usage}%</span>
</div>
""", unsafe_allow_html=True)
if "engine_ready" not in st.session_state:
st.session_state.engine_ready = False
if st.button("๐ Initialize Engine"):
manager = BitNetManager()
if manager.setup_engine(model_id=model_id):
st.session_state.engine_ready = True
st.success("BitNet.cpp Ready!")
else:
st.error("Engine setup failed. Check logs above.")
# Main Chat Interface
status_placeholder = st.empty()
if not st.session_state.engine_ready:
status_placeholder.warning("โ ๏ธ Engine not initialized. Please click 'Initialize Engine' in the sidebar to start.")
# Render message history
for message in st.session_state.messages:
with st.chat_message(message["role"]):
st.markdown(message["content"])
# Chat input
if prompt := st.chat_input("Ask me anything..."):
if not st.session_state.engine_ready:
st.error("You must initialize the engine before chatting.")
else:
st.session_state.messages.append({"role": "user", "content": prompt})
with st.chat_message("user"):
st.markdown(prompt)
with st.chat_message("assistant"):
message_placeholder = st.empty()
full_response = ""
manager = BitNetManager()
model_path = manager.download_model(model_id=model_id, filename="ggml-model-i2_s.gguf")
if model_path:
with st.status("๐ Initializing inference engine...") as status:
# Execute real inference
process = manager.run_inference(prompt, model_path)
if process:
status.update(label="๐ง Generating response using 1-bit kernels...", state="running")
full_response = ""
log_buffer = []
# Create an expander for engine logs
with st.expander("๐ ๏ธ Engine Metadata & Logs", expanded=False):
log_placeholder = st.empty()
# Stream the output tokens
for line in process.stdout:
# Filter out engine metadata from the chat bubble
metadata_prefixes = ("llama", "main", "llm_", "llm-", "system_info", "sampler", "generate", "common", "BOS", "EOS", "UNK", "PAD")
if any(line.strip().startswith(p) for p in metadata_prefixes) or "..." in line or "init:" in line:
log_buffer.append(line)
log_placeholder.code("".join(log_buffer[-10:]))
else:
full_response += line
message_placeholder.markdown(full_response + "โ")
status.update(label="โ
Generation Complete", state="complete")
# Check for errors in stderr but don't clutter the chat
stderr = process.stderr.read()
if stderr:
log_buffer.append(f"\nSTDERR: {stderr}")
log_placeholder.code("".join(log_buffer[-15:]))
else:
status.update(label="โ Failed to launch engine", state="error")
full_response = "Failed to launch inference engine."
message_placeholder.markdown(full_response)
else:
st.error("Model not available.")
st.session_state.messages.append({"role": "assistant", "content": full_response})
|