owlninjam committed on
Commit
087d8a2
·
verified ·
1 Parent(s): fd6e635

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +146 -17
app.py CHANGED
@@ -1,21 +1,150 @@
1
- import subprocess
 
 
2
  import os
 
3
 
4
- model_path = "capybarahermes-2.5-mistral-7b.Q5_K_M.gguf"
 
 
5
 
6
- # Download model if not exists
7
- if not os.path.exists(model_path):
8
- subprocess.run([
9
- "wget",
10
- "https://huggingface.co/TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF/resolve/main/" + model_path
11
- ])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- # Run llama.cpp server
14
- subprocess.run([
15
- "./main",
16
- "-m", model_path,
17
- "--host", "0.0.0.0",
18
- "--port", "7860",
19
- "-c", "2048",
20
- "-t", "2"
21
- ])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from llama_cpp import Llama
3
+ import time
4
  import os
5
+ from threading import Lock
6
 
7
# Global variables
# NOTE(review): `model` is never read or reassigned anywhere in this file —
# the cached instance comes from load_model() instead. Looks vestigial; confirm
# before removing.
model = None
# Lock taken in main() around generation so only one inference runs at a time.
model_lock = Lock()
10
 
11
@st.cache_resource
def load_model():
    """Load the quantized GGUF model from disk, cached for the app's lifetime.

    Returns:
        A ready ``Llama`` instance, or ``None`` when the model file is
        missing or construction raises (a Streamlit error is shown in
        either failure case).
    """
    model_path = "capybarahermes-2.5-mistral-7b.Q5_K_M.gguf"

    # Fail fast with a visible UI error rather than letting Llama() raise.
    if not os.path.exists(model_path):
        st.error(f"Model file {model_path} not found!")
        return None

    with st.spinner("Loading model... This may take a few minutes."):
        try:
            # Return directly — @st.cache_resource keeps the instance alive.
            return Llama(
                model_path=model_path,
                n_ctx=4096,       # Context window
                n_threads=2,      # Use both CPU cores
                n_batch=512,      # Batch size for processing
                verbose=False,
                use_mlock=True,   # Keep model in RAM
                n_gpu_layers=0,   # CPU only
            )
        except Exception as e:
            st.error(f"Error loading model: {str(e)}")
            return None
35
 
36
def generate_response(llm, prompt, max_tokens=512, temperature=0.7):
    """Stream a completion from *llm* into the Streamlit UI.

    Wraps *prompt* in the ChatML template, streams tokens into a live
    placeholder, then shows a tokens/sec caption.

    Returns:
        The full generated text, or ``None`` if generation raised.
    """
    try:
        # Format prompt for ChatML format
        formatted_prompt = f"""<|im_start|>system
You are a helpful AI assistant.
<|im_end|>
<|im_start|>user
{prompt}
<|im_end|>
<|im_start|>assistant
"""

        placeholder = st.empty()
        reply = ""
        started_at = time.time()
        n_tokens = 0

        # Generate with streaming
        stream = llm(
            formatted_prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=0.9,
            stop=["<|im_end|>", "<|im_start|>"],
            stream=True,
            echo=False,
        )

        for chunk in stream:
            choices = chunk.get('choices') or []
            if choices:
                reply += choices[0].get('text', '')
                n_tokens += 1
                # Redraw the partial reply with a cursor glyph appended.
                placeholder.markdown(reply + "▌")

        # Final render without the cursor.
        placeholder.markdown(reply)

        # Throughput stats for the caption.
        elapsed = time.time() - started_at
        rate = n_tokens / elapsed if elapsed > 0 else 0

        st.caption(f"Generated {n_tokens} tokens in {elapsed:.2f}s ({rate:.2f} tokens/sec)")

        return reply

    except Exception as e:
        st.error(f"Error generating response: {str(e)}")
        return None
89
+
90
def main():
    """Render the chat UI: settings sidebar, history, input, and responses.

    Statement order is significant — Streamlit draws widgets in execution
    order, so the layout below must not be rearranged.
    """
    st.set_page_config(
        page_title="CapybaraHermes Chat",
        page_icon="🦙",
        layout="wide",
    )

    st.title("🦙 CapybaraHermes-2.5-Mistral-7B Chat")
    st.markdown("*Quantized model running on CPU*")

    # Cached model; bail out of the script entirely if it failed to load.
    chat_model = load_model()
    if chat_model is None:
        st.stop()

    # Sidebar: generation settings and static model info.
    with st.sidebar:
        st.header("Settings")
        max_tokens = st.slider("Max Tokens", 50, 1024, 512)
        temperature = st.slider("Temperature", 0.0, 1.0, 0.7, 0.1)

        st.header("Model Info")
        st.info("""
        **Model:** CapybaraHermes-2.5-Mistral-7B
        **Quantization:** Q5_K_M
        **Size:** ~5GB
        **Expected Speed:** 2-8 tokens/sec
        """)

    # Chat interface
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Replay the conversation so far.
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Chat input
    prompt = st.chat_input("What would you like to know?")
    if prompt:
        # Record and echo the user's turn.
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        # Generate and display assistant response
        with st.chat_message("assistant"):
            with model_lock:  # Ensure thread safety
                response = generate_response(chat_model, prompt, max_tokens, temperature)

            if response:
                st.session_state.messages.append({"role": "assistant", "content": response})

    # Clear chat button
    if st.button("Clear Chat History"):
        st.session_state.messages = []
        st.rerun()
148
+
149
# Script entry point: run the Streamlit app when executed directly.
if __name__ == "__main__":
    main()