owlninjam committed on
Commit
087d8a2
·
verified ·
1 Parent(s): fd6e635

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +146 -17
app.py CHANGED
@@ -1,21 +1,150 @@
1
- import subprocess
 
 
2
  import os
 
3
 
4
- model_path = "capybarahermes-2.5-mistral-7b.Q5_K_M.gguf"
 
 
5
 
6
- # Download model if not exists
7
- if not os.path.exists(model_path):
8
- subprocess.run([
9
- "wget",
10
- "https://huggingface.co/TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF/resolve/main/" + model_path
11
- ])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- # Run llama.cpp server
14
- subprocess.run([
15
- "./main",
16
- "-m", model_path,
17
- "--host", "0.0.0.0",
18
- "--port", "7860",
19
- "-c", "2048",
20
- "-t", "2"
21
- ])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from llama_cpp import Llama
3
+ import time
4
  import os
5
+ from threading import Lock
6
 
7
# Global variables
# NOTE(review): `model` is never read or reassigned anywhere in this file —
# the cached instance comes from load_model() instead. Looks vestigial; confirm
# before removing.
model = None
# Lock taken in main() around generation so only one inference runs at a time.
model_lock = Lock()
10
 
11
@st.cache_resource
def load_model():
    """Load the quantized GGUF model from disk, cached for the app's lifetime.

    Returns:
        A ready ``Llama`` instance, or ``None`` when the model file is
        missing or construction raises (a Streamlit error is shown in
        either failure case).
    """
    model_path = "capybarahermes-2.5-mistral-7b.Q5_K_M.gguf"

    # Fail fast with a visible UI error rather than letting Llama() raise.
    if not os.path.exists(model_path):
        st.error(f"Model file {model_path} not found!")
        return None

    with st.spinner("Loading model... This may take a few minutes."):
        try:
            # Return directly — @st.cache_resource keeps the instance alive.
            return Llama(
                model_path=model_path,
                n_ctx=4096,       # Context window
                n_threads=2,      # Use both CPU cores
                n_batch=512,      # Batch size for processing
                verbose=False,
                use_mlock=True,   # Keep model in RAM
                n_gpu_layers=0,   # CPU only
            )
        except Exception as e:
            st.error(f"Error loading model: {str(e)}")
            return None
35
 
36
def generate_response(llm, prompt, max_tokens=512, temperature=0.7):
    """Stream a completion from *llm* into the Streamlit UI.

    Wraps *prompt* in the ChatML template, streams tokens into a live
    placeholder, then shows a tokens/sec caption.

    Returns:
        The full generated text, or ``None`` if generation raised.
    """
    try:
        # Format prompt for ChatML format
        formatted_prompt = f"""<|im_start|>system
You are a helpful AI assistant.
<|im_end|>
<|im_start|>user
{prompt}
<|im_end|>
<|im_start|>assistant
"""

        placeholder = st.empty()
        reply = ""
        started_at = time.time()
        n_tokens = 0

        # Generate with streaming
        stream = llm(
            formatted_prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=0.9,
            stop=["<|im_end|>", "<|im_start|>"],
            stream=True,
            echo=False,
        )

        for chunk in stream:
            choices = chunk.get('choices') or []
            if choices:
                reply += choices[0].get('text', '')
                n_tokens += 1
                # Redraw the partial reply with a cursor glyph appended.
                placeholder.markdown(reply + "▌")

        # Final render without the cursor.
        placeholder.markdown(reply)

        # Throughput stats for the caption.
        elapsed = time.time() - started_at
        rate = n_tokens / elapsed if elapsed > 0 else 0

        st.caption(f"Generated {n_tokens} tokens in {elapsed:.2f}s ({rate:.2f} tokens/sec)")

        return reply

    except Exception as e:
        st.error(f"Error generating response: {str(e)}")
        return None
89
+
90
def main():
    """Render the chat UI: settings sidebar, history, input, and responses.

    Statement order is significant — Streamlit draws widgets in execution
    order, so the layout below must not be rearranged.
    """
    st.set_page_config(
        page_title="CapybaraHermes Chat",
        page_icon="🦙",
        layout="wide",
    )

    st.title("🦙 CapybaraHermes-2.5-Mistral-7B Chat")
    st.markdown("*Quantized model running on CPU*")

    # Cached model; bail out of the script entirely if it failed to load.
    chat_model = load_model()
    if chat_model is None:
        st.stop()

    # Sidebar: generation settings and static model info.
    with st.sidebar:
        st.header("Settings")
        max_tokens = st.slider("Max Tokens", 50, 1024, 512)
        temperature = st.slider("Temperature", 0.0, 1.0, 0.7, 0.1)

        st.header("Model Info")
        st.info("""
        **Model:** CapybaraHermes-2.5-Mistral-7B
        **Quantization:** Q5_K_M
        **Size:** ~5GB
        **Expected Speed:** 2-8 tokens/sec
        """)

    # Chat interface
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Replay the conversation so far.
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Chat input
    prompt = st.chat_input("What would you like to know?")
    if prompt:
        # Record and echo the user's turn.
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        # Generate and display assistant response
        with st.chat_message("assistant"):
            with model_lock:  # Ensure thread safety
                response = generate_response(chat_model, prompt, max_tokens, temperature)

            if response:
                st.session_state.messages.append({"role": "assistant", "content": response})

    # Clear chat button
    if st.button("Clear Chat History"):
        st.session_state.messages = []
        st.rerun()
148
+
149
# Script entry point: run the Streamlit app when executed directly.
if __name__ == "__main__":
    main()