# Hugging Face upload metadata (commit 6b3d51d, "Upload 5 files" by Vishwas1)
import streamlit as st
import psutil
from model_manager import BitNetManager
# Basic page chrome: wide layout, brain icon in the browser tab.
st.set_page_config(page_title="BitNet CPU Assistant", page_icon="🧠", layout="wide")
# Dark GitHub-style theme plus a reusable "status-card" container used by the
# sidebar resource metrics; unsafe_allow_html is required to inject raw CSS.
st.markdown("""
<style>
.stApp { background-color: #0d1117; color: #c9d1d9; }
.status-card {
background: rgba(30, 41, 59, 0.5);
border: 1px solid #30363d;
border-radius: 10px;
padding: 15px;
margin-bottom: 10px;
}
.metric-value { color: #58a6ff; font-weight: bold; }
h1, h2, h3 { color: #58a6ff; }
</style>
""", unsafe_allow_html=True)
# Page heading and subtitle.
st.title("🧠 BitNet CPU Assistant")
st.caption("Blazingly fast 1-bit LLM Inference on CPU-only Environments")
# Persist chat history across Streamlit reruns (each interaction re-executes
# the whole script, so state must live in st.session_state).
if "messages" not in st.session_state:
    st.session_state.messages = []
# Sidebar for controls and monitoring
with st.sidebar:
    st.header("βš™οΈ Settings")
    # Corrected IDs from the official setup_env.py usage.
    # Maps repo id -> human-readable label shown in the selectbox.
    model_options = {
        "1bitLLM/bitnet_b1_58-3B": "3B Optimized (Recommended)",
        "1bitLLM/bitnet_b1_58-large": "Large (Efficient)",
        "microsoft/BitNet-b1.58-2B-4T": "2B Specialized"
    }
    model_id = st.selectbox("Select Model", options=list(model_options.keys()), format_func=lambda x: model_options[x])
    st.header("πŸ“ˆ System Resources")
    # NOTE(review): cpu_percent() without an interval reports usage since the
    # previous call; the very first call of a session may return 0.0.
    cpu_usage = psutil.cpu_percent()
    ram_usage = psutil.virtual_memory().percent
    st.markdown(f"""
<div class="status-card">
CPU Usage: <span class="metric-value">{cpu_usage}%</span><br>
RAM Usage: <span class="metric-value">{ram_usage}%</span>
</div>
""", unsafe_allow_html=True)
    if "engine_ready" not in st.session_state:
        st.session_state.engine_ready = False
    if st.button("πŸš€ Initialize Engine"):
        manager = BitNetManager()
        if manager.setup_engine(model_id=model_id):
            # Keep the configured manager so later reruns can reuse the
            # initialized engine instead of constructing a fresh instance.
            st.session_state.manager = manager
            st.session_state.engine_ready = True
            st.success("BitNet.cpp Ready!")
        else:
            st.error("Engine setup failed. Check logs above.")
# Main Chat Interface
status_placeholder = st.empty()
if not st.session_state.engine_ready:
    status_placeholder.warning("⚠️ Engine not initialized. Please click 'Initialize Engine' in the sidebar to start.")
# Render message history (session_state survives reruns; widgets do not).
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])
# Chat input: runs one full request/response turn per submitted prompt.
if prompt := st.chat_input("Ask me anything..."):
    if not st.session_state.engine_ready:
        st.error("You must initialize the engine before chatting.")
    else:
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)
        with st.chat_message("assistant"):
            message_placeholder = st.empty()
            full_response = ""
            # Reuse the manager initialized in the sidebar when available;
            # fall back to a fresh instance so this turn still works.
            manager = st.session_state.get("manager") or BitNetManager()
            model_path = manager.download_model(model_id=model_id, filename="ggml-model-i2_s.gguf")
            if model_path:
                with st.status("πŸš€ Initializing inference engine...") as status:
                    # Execute real inference (returns a Popen-like handle).
                    process = manager.run_inference(prompt, model_path)
                    if process:
                        status.update(label="🧠 Generating response using 1-bit kernels...", state="running")
                        log_buffer = []
                        # Engine metadata goes to this expander, not the chat bubble.
                        with st.expander("πŸ› οΈ Engine Metadata & Logs", expanded=False):
                            log_placeholder = st.empty()
                        # Hoisted out of the loop: tested against every stdout line.
                        metadata_prefixes = ("llama", "main", "llm_", "llm-", "system_info", "sampler", "generate", "common", "BOS", "EOS", "UNK", "PAD")
                        # Stream the output tokens
                        for line in process.stdout:
                            # Filter out engine metadata from the chat bubble
                            if any(line.strip().startswith(p) for p in metadata_prefixes) or "..." in line or "init:" in line:
                                log_buffer.append(line)
                                log_placeholder.code("".join(log_buffer[-10:]))
                            else:
                                full_response += line
                                # Trailing block cursor mimics live typing.
                                message_placeholder.markdown(full_response + "β–Œ")
                        status.update(label="βœ… Generation Complete", state="complete")
                        # Check for errors in stderr but don't clutter the chat.
                        # NOTE(review): stderr is read only after stdout is
                        # drained; a very chatty stderr could fill its pipe
                        # buffer and stall the child — confirm engine behavior.
                        stderr = process.stderr.read()
                        if stderr:
                            log_buffer.append(f"\nSTDERR: {stderr}")
                            log_placeholder.code("".join(log_buffer[-15:]))
                        # Reap the child so it does not linger as a zombie.
                        process.wait()
                    else:
                        status.update(label="❌ Failed to launch engine", state="error")
                        full_response = "Failed to launch inference engine."
                # Final render without the streaming cursor.
                message_placeholder.markdown(full_response)
            else:
                st.error("Model not available.")
        st.session_state.messages.append({"role": "assistant", "content": full_response})