File size: 5,340 Bytes
2544845
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
804d64f
 
 
 
 
 
 
2544845
 
 
 
 
 
 
 
 
 
 
 
804d64f
 
 
203a7b1
 
b4c8d46
203a7b1
 
804d64f
203a7b1
2544845
 
203a7b1
 
 
 
 
2544845
 
 
 
203a7b1
2544845
203a7b1
 
804d64f
 
 
 
2544845
804d64f
 
 
 
 
 
2544845
 
6b3d51d
 
 
804d64f
6b3d51d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2544845
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import streamlit as st
import psutil
from model_manager import BitNetManager

# Single-page Streamlit front-end for CPU-only 1-bit LLM inference, driven
# by the project's BitNetManager wrapper (setup, download, inference).
st.set_page_config(page_title="BitNet CPU Assistant", page_icon="๐Ÿง ", layout="wide")

# Dark GitHub-style theme: page background, the bordered ".status-card" used
# by the sidebar resource monitor, and the blue accent for headings/metrics.
# NOTE: the blank lines inside this literal are preserved deliberately —
# Markdown treats blank lines as block separators even with raw HTML.
st.markdown("""

<style>

    .stApp { background-color: #0d1117; color: #c9d1d9; }

    .status-card {

        background: rgba(30, 41, 59, 0.5);

        border: 1px solid #30363d;

        border-radius: 10px;

        padding: 15px;

        margin-bottom: 10px;

    }

    .metric-value { color: #58a6ff; font-weight: bold; }

    h1, h2, h3 { color: #58a6ff; }

</style>

""", unsafe_allow_html=True)

st.title("๐Ÿง  BitNet CPU Assistant")
st.caption("Blazingly fast 1-bit LLM Inference on CPU-only Environments")

# Chat history survives Streamlit's top-to-bottom rerun on every interaction;
# each entry is {"role": "user"|"assistant", "content": str}.
if "messages" not in st.session_state:
    st.session_state.messages = []

# Sidebar for controls and monitoring: model choice, a live resource
# snapshot, and the one-time engine initialization button.
with st.sidebar:
    st.header("โš™๏ธ Settings")
    # Corrected IDs from the official setup_env.py usage.
    # Maps HF repo ID -> human-friendly label shown in the selectbox.
    model_options = {
        "1bitLLM/bitnet_b1_58-3B": "3B Optimized (Recommended)",
        "1bitLLM/bitnet_b1_58-large": "Large (Efficient)",
        "microsoft/BitNet-b1.58-2B-4T": "2B Specialized"
    }
    # The selected *value* stays the repo ID; only the display text is mapped.
    model_id = st.selectbox("Select Model", options=list(model_options.keys()), format_func=lambda x: model_options[x])
    
    st.header("๐Ÿ“ˆ System Resources")
    # Instantaneous snapshot, refreshed on every Streamlit rerun.
    cpu_usage = psutil.cpu_percent()
    ram_usage = psutil.virtual_memory().percent
    
    # Blank lines inside this literal are intentional (Markdown block rules).
    st.markdown(f"""

    <div class="status-card">

        CPU Usage: <span class="metric-value">{cpu_usage}%</span><br>

        RAM Usage: <span class="metric-value">{ram_usage}%</span>

    </div>

    """, unsafe_allow_html=True)
    
    if "engine_ready" not in st.session_state:
        st.session_state.engine_ready = False

    if st.button("๐Ÿš€ Initialize Engine"):
        manager = BitNetManager()
        if manager.setup_engine(model_id=model_id):
            # BUG FIX: keep the initialized manager in session state so the
            # chat path can reuse it — previously it was discarded on rerun
            # and a throwaway BitNetManager was rebuilt for every message.
            st.session_state.manager = manager
            st.session_state.engine_ready = True
            st.success("BitNet.cpp Ready!")
        else:
            st.error("Engine setup failed. Check logs above.")

# Main Chat Interface: warn until the engine is initialized, then replay
# the stored conversation so it persists across reruns.
status_placeholder = st.empty()
# Defensive lookup: don't rely on the sidebar block having created the
# "engine_ready" key earlier in this same rerun (fragile implicit ordering
# that would raise AttributeError if the sidebar code path ever changes).
if not st.session_state.get("engine_ready", False):
    status_placeholder.warning("โš ๏ธ Engine not initialized. Please click 'Initialize Engine' in the sidebar to start.")

# Render message history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Chat input — handle one user turn end-to-end: echo the prompt, stream the
# model's stdout into the assistant bubble (routing engine metadata lines to
# a collapsible log), and record both sides of the exchange in history.
if prompt := st.chat_input("Ask me anything..."):
    if not st.session_state.engine_ready:
        # BUG FIX: stop here without touching history. Previously the
        # trailing assistant-append ran on this path too, raising NameError
        # because `full_response` was never assigned.
        st.error("You must initialize the engine before chatting.")
    else:
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)
        
        with st.chat_message("assistant"):
            message_placeholder = st.empty()
            full_response = ""
            
            # Reuse the manager initialized via the sidebar button when
            # available; fall back to a fresh instance otherwise.
            manager = st.session_state.get("manager") or BitNetManager()
            model_path = manager.download_model(model_id=model_id, filename="ggml-model-i2_s.gguf")
            
            # BUG FIX: everything below is now *inside* the assistant
            # chat_message context — it was previously dedented out of it,
            # so the status widget and log expander rendered outside the
            # assistant bubble.
            if model_path:
                with st.status("๐Ÿš€ Initializing inference engine...") as status:
                    # Execute real inference
                    process = manager.run_inference(prompt, model_path)
                    
                    if process:
                        status.update(label="๐Ÿง  Generating response using 1-bit kernels...", state="running")
                        log_buffer = []
                        
                        # Create an expander for engine logs
                        with st.expander("๐Ÿ› ๏ธ Engine Metadata & Logs", expanded=False):
                            log_placeholder = st.empty()
                        
                        # Hoisted out of the loop: prefixes marking llama.cpp
                        # engine chatter rather than generated text.
                        metadata_prefixes = ("llama", "main", "llm_", "llm-", "system_info", "sampler", "generate", "common", "BOS", "EOS", "UNK", "PAD")
                        
                        # Stream the output tokens
                        for line in process.stdout:
                            # Filter engine metadata out of the chat bubble.
                            if any(line.strip().startswith(p) for p in metadata_prefixes) or "..." in line or "init:" in line:
                                log_buffer.append(line)
                                log_placeholder.code("".join(log_buffer[-10:]))
                            else:
                                full_response += line
                                message_placeholder.markdown(full_response + "โ–Œ")
                        
                        # BUG FIX: drop the streaming cursor once generation
                        # finishes (it previously stayed on screen).
                        message_placeholder.markdown(full_response)
                        status.update(label="โœ… Generation Complete", state="complete")
                        
                        # Check for errors in stderr but don't clutter the chat
                        stderr = process.stderr.read()
                        if stderr:
                            log_buffer.append(f"\nSTDERR: {stderr}")
                            log_placeholder.code("".join(log_buffer[-15:]))
                    else:
                        status.update(label="โŒ Failed to launch engine", state="error")
                        full_response = "Failed to launch inference engine."
                        message_placeholder.markdown(full_response)
            else:
                st.error("Model not available.")
        
        # BUG FIX: only record an assistant turn when something was actually
        # produced — previously an empty "" message was appended to history
        # whenever the model was unavailable.
        if full_response:
            st.session_state.messages.append({"role": "assistant", "content": full_response})