File size: 5,340 Bytes
2544845
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
804d64f
 
 
 
 
 
 
2544845
 
 
 
 
 
 
 
 
 
 
 
804d64f
 
 
203a7b1
 
b4c8d46
203a7b1
 
804d64f
203a7b1
2544845
 
203a7b1
 
 
 
 
2544845
 
 
 
203a7b1
2544845
203a7b1
 
804d64f
 
 
 
2544845
804d64f
 
 
 
 
 
2544845
 
6b3d51d
 
 
804d64f
6b3d51d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2544845
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import streamlit as st
import psutil
from model_manager import BitNetManager

# Single-page Streamlit front-end for CPU-only 1-bit LLM inference, driven
# by the project's BitNetManager wrapper (setup, download, inference).
st.set_page_config(page_title="BitNet CPU Assistant", page_icon="๐Ÿง ", layout="wide")

# Dark GitHub-style theme: page background, the bordered ".status-card" used
# by the sidebar resource monitor, and the blue accent for headings/metrics.
# NOTE: the blank lines inside this literal are preserved deliberately —
# Markdown treats blank lines as block separators even with raw HTML.
st.markdown("""

<style>

    .stApp { background-color: #0d1117; color: #c9d1d9; }

    .status-card {

        background: rgba(30, 41, 59, 0.5);

        border: 1px solid #30363d;

        border-radius: 10px;

        padding: 15px;

        margin-bottom: 10px;

    }

    .metric-value { color: #58a6ff; font-weight: bold; }

    h1, h2, h3 { color: #58a6ff; }

</style>

""", unsafe_allow_html=True)

st.title("๐Ÿง  BitNet CPU Assistant")
st.caption("Blazingly fast 1-bit LLM Inference on CPU-only Environments")

# Chat history survives Streamlit's top-to-bottom rerun on every interaction;
# each entry is {"role": "user"|"assistant", "content": str}.
if "messages" not in st.session_state:
    st.session_state.messages = []

# Sidebar for controls and monitoring: model choice, a live resource
# snapshot, and the one-time engine initialization button.
with st.sidebar:
    st.header("โš™๏ธ Settings")
    # Corrected IDs from the official setup_env.py usage.
    # Maps HF repo ID -> human-friendly label shown in the selectbox.
    model_options = {
        "1bitLLM/bitnet_b1_58-3B": "3B Optimized (Recommended)",
        "1bitLLM/bitnet_b1_58-large": "Large (Efficient)",
        "microsoft/BitNet-b1.58-2B-4T": "2B Specialized"
    }
    # The selected *value* stays the repo ID; only the display text is mapped.
    model_id = st.selectbox("Select Model", options=list(model_options.keys()), format_func=lambda x: model_options[x])
    
    st.header("๐Ÿ“ˆ System Resources")
    # Instantaneous snapshot, refreshed on every Streamlit rerun.
    cpu_usage = psutil.cpu_percent()
    ram_usage = psutil.virtual_memory().percent
    
    # Blank lines inside this literal are intentional (Markdown block rules).
    st.markdown(f"""

    <div class="status-card">

        CPU Usage: <span class="metric-value">{cpu_usage}%</span><br>

        RAM Usage: <span class="metric-value">{ram_usage}%</span>

    </div>

    """, unsafe_allow_html=True)
    
    if "engine_ready" not in st.session_state:
        st.session_state.engine_ready = False

    if st.button("๐Ÿš€ Initialize Engine"):
        manager = BitNetManager()
        if manager.setup_engine(model_id=model_id):
            # BUG FIX: keep the initialized manager in session state so the
            # chat path can reuse it — previously it was discarded on rerun
            # and a throwaway BitNetManager was rebuilt for every message.
            st.session_state.manager = manager
            st.session_state.engine_ready = True
            st.success("BitNet.cpp Ready!")
        else:
            st.error("Engine setup failed. Check logs above.")

# Main Chat Interface: warn until the engine is initialized, then replay
# the stored conversation so it persists across reruns.
status_placeholder = st.empty()
# Defensive lookup: don't rely on the sidebar block having created the
# "engine_ready" key earlier in this same rerun (fragile implicit ordering
# that would raise AttributeError if the sidebar code path ever changes).
if not st.session_state.get("engine_ready", False):
    status_placeholder.warning("โš ๏ธ Engine not initialized. Please click 'Initialize Engine' in the sidebar to start.")

# Render message history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Chat input — handle one user turn end-to-end: echo the prompt, stream the
# model's stdout into the assistant bubble (routing engine metadata lines to
# a collapsible log), and record both sides of the exchange in history.
if prompt := st.chat_input("Ask me anything..."):
    if not st.session_state.engine_ready:
        # BUG FIX: stop here without touching history. Previously the
        # trailing assistant-append ran on this path too, raising NameError
        # because `full_response` was never assigned.
        st.error("You must initialize the engine before chatting.")
    else:
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)
        
        with st.chat_message("assistant"):
            message_placeholder = st.empty()
            full_response = ""
            
            # Reuse the manager initialized via the sidebar button when
            # available; fall back to a fresh instance otherwise.
            manager = st.session_state.get("manager") or BitNetManager()
            model_path = manager.download_model(model_id=model_id, filename="ggml-model-i2_s.gguf")
            
            # BUG FIX: everything below is now *inside* the assistant
            # chat_message context — it was previously dedented out of it,
            # so the status widget and log expander rendered outside the
            # assistant bubble.
            if model_path:
                with st.status("๐Ÿš€ Initializing inference engine...") as status:
                    # Execute real inference
                    process = manager.run_inference(prompt, model_path)
                    
                    if process:
                        status.update(label="๐Ÿง  Generating response using 1-bit kernels...", state="running")
                        log_buffer = []
                        
                        # Create an expander for engine logs
                        with st.expander("๐Ÿ› ๏ธ Engine Metadata & Logs", expanded=False):
                            log_placeholder = st.empty()
                        
                        # Hoisted out of the loop: prefixes marking llama.cpp
                        # engine chatter rather than generated text.
                        metadata_prefixes = ("llama", "main", "llm_", "llm-", "system_info", "sampler", "generate", "common", "BOS", "EOS", "UNK", "PAD")
                        
                        # Stream the output tokens
                        for line in process.stdout:
                            # Filter engine metadata out of the chat bubble.
                            if any(line.strip().startswith(p) for p in metadata_prefixes) or "..." in line or "init:" in line:
                                log_buffer.append(line)
                                log_placeholder.code("".join(log_buffer[-10:]))
                            else:
                                full_response += line
                                message_placeholder.markdown(full_response + "โ–Œ")
                        
                        # BUG FIX: drop the streaming cursor once generation
                        # finishes (it previously stayed on screen).
                        message_placeholder.markdown(full_response)
                        status.update(label="โœ… Generation Complete", state="complete")
                        
                        # Check for errors in stderr but don't clutter the chat
                        stderr = process.stderr.read()
                        if stderr:
                            log_buffer.append(f"\nSTDERR: {stderr}")
                            log_placeholder.code("".join(log_buffer[-15:]))
                    else:
                        status.update(label="โŒ Failed to launch engine", state="error")
                        full_response = "Failed to launch inference engine."
                        message_placeholder.markdown(full_response)
            else:
                st.error("Model not available.")
        
        # BUG FIX: only record an assistant turn when something was actually
        # produced — previously an empty "" message was appended to history
        # whenever the model was unavailable.
        if full_response:
            st.session_state.messages.append({"role": "assistant", "content": full_response})