ItsMeDevRoland committed on
Commit 8317786 · verified · 1 Parent(s): d2a9ed5

Update app.py

Files changed (1)
  1. app.py +32 -23
app.py CHANGED
@@ -30,13 +30,12 @@ for package in REQUIRED_PACKAGES:
 import gradio as gr
 import torch
 from huggingface_hub import hf_hub_download
- from transformers import AutoTokenizer
 import os

 # Efficient GGUF model download and loading
 def download_and_load_model(
-     repo_id="N-Bot-Int/OpenElla3-Llama3.2B-GGUF",
-     filename="unsloth.Q4_K_M.gguf"
+     repo_id="HuggingFaceH4/zephyr-7b-beta",
+     filename="zephyr-7b-beta.Q4_K_M.gguf"
 ):
     """
     Download GGUF model from HuggingFace if not exists
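For context, a minimal sketch of what download_and_load_model plausibly does, assuming llama-cpp-python as the GGUF backend (the function body is outside this diff, so the Llama parameters below are assumptions, not the app's actual code):

from huggingface_hub import hf_hub_download
from llama_cpp import Llama  # assumed GGUF backend; not shown in the diff

def download_and_load_model(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    filename="zephyr-7b-beta.Q4_K_M.gguf",
):
    # hf_hub_download returns the cached local path and skips the
    # download when the file already exists in the cache
    model_path = hf_hub_download(repo_id=repo_id, filename=filename)
    # n_ctx (context window) is an illustrative value, not from the diff
    return Llama(model_path=model_path, n_ctx=2048)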
@@ -138,27 +137,37 @@ def respond(
         str: Streaming response
     """
     # Prepare the full prompt with system message and history
-     full_messages = [{"role": "system", "content": system_message}]
-     full_messages.extend(format_history(history))
-     full_messages.append({"role": "user", "content": message})
+     full_prompt = system_message + "\n\n"

-     # Prepare the prompt string for the model
-     prompt = message
+     # Add chat history
+     for user, assistant in history:
+         if user:
+             full_prompt += f"User: {user}\n"
+         if assistant:
+             full_prompt += f"Assistant: {assistant}\n"
+
+     # Add current message
+     full_prompt += f"User: {message}\n"
+     full_prompt += "Assistant: "

     # Generate response with streaming
     response = ""
-     for chunk in llm_model.generate(
-         prompt,
-         max_tokens=max_tokens,
-         stop=[],  # You can add stop sequences if needed
-         temperature=temperature,
-         top_p=top_p,
-         stream=True
-     ):
-         response += chunk
-         yield response
+     try:
+         for chunk in llm_model.generate(
+             full_prompt,
+             max_tokens=max_tokens,
+             stop=["User:", "\n"],  # Stop on new user input
+             temperature=temperature,
+             top_p=top_p,
+             stream=True
+         ):
+             response += chunk
+             yield response
+     except Exception as e:
+         print(f"Error generating response: {e}")
+         yield f"An error occurred: {e}"

- # Create Gradio interface
+ # Create Gradio interface with updated configuration
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
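To make the new prompt construction concrete, this is the string the loop above yields for a one-turn history (all values illustrative):

system_message = "You are a friendly chatbot."
history = [("Hi", "Hello! How can I help?")]
message = "What is a GGUF file?"

# full_prompt, as built by the code above, becomes:
full_prompt = (
    "You are a friendly chatbot.\n\n"
    "User: Hi\n"
    "Assistant: Hello! How can I help?\n"
    "User: What is a GGUF file?\n"
    "Assistant: "
)

Yielding the accumulated response rather than each raw chunk is what gives gr.ChatInterface its live-typing effect: each yield replaces the currently displayed message.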
@@ -173,6 +182,8 @@ demo = gr.ChatInterface(
             label="Top-p (nucleus sampling)",
         ),
     ],
+     # Explicitly set chatbot type to messages
+     chatbot=gr.Chatbot(type="messages")
 )

 if __name__ == "__main__":
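One consequence of type="messages" worth keeping in mind: Gradio then passes history as OpenAI-style role/content dicts rather than (user, assistant) tuples, so the history loop in respond would need the equivalent of this sketch:

# With chatbot=gr.Chatbot(type="messages"), history arrives as, e.g.:
#   [{"role": "user", "content": "Hi"},
#    {"role": "assistant", "content": "Hello!"}]
for turn in history:
    if turn["role"] == "user":
        full_prompt += f"User: {turn['content']}\n"
    elif turn["role"] == "assistant":
        full_prompt += f"Assistant: {turn['content']}\n"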
@@ -180,10 +191,8 @@ if __name__ == "__main__":
     print(f"Available CPU threads: {torch.get_num_threads()}")
     print(f"Model path: {MODEL_PATH}")

-     # Launch the Gradio interface
+     # Launch the Gradio interface with compatible parameters
     demo.launch(
-         # Optional optimization settings
         show_api=False,  # Disable API endpoint
-         enable_queue=True,  # Enable request queuing
-         max_threads=max(torch.get_num_threads() // 2, 1)  # Limit threads
+         share=False  # Do not create public URL
     )
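The dropped launch() arguments no longer exist in Gradio 4.x, where queuing is configured on the app object before launching. A minimal sketch assuming Gradio 4.x (the concurrency value is illustrative and roughly plays the old max_threads role):

# Queuing moved from launch(enable_queue=...) to an explicit .queue() call
demo.queue(default_concurrency_limit=2)
demo.launch(
    show_api=False,  # Disable API endpoint
    share=False      # Do not create public URL
)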
 