ItsMeDevRoland committed on
Commit 8317786 · verified · 1 Parent(s): d2a9ed5

Update app.py

Files changed (1)
  1. app.py +32 -23
app.py CHANGED
@@ -30,13 +30,12 @@ for package in REQUIRED_PACKAGES:
 import gradio as gr
 import torch
 from huggingface_hub import hf_hub_download
- from transformers import AutoTokenizer
 import os

 # Efficient GGUF model download and loading
 def download_and_load_model(
-     repo_id="N-Bot-Int/OpenElla3-Llama3.2B-GGUF",
-     filename="unsloth.Q4_K_M.gguf"
+     repo_id="HuggingFaceH4/zephyr-7b-beta",
+     filename="zephyr-7b-beta.Q4_K_M.gguf"
 ):
     """
     Download GGUF model from HuggingFace if not exists
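For context, a minimal sketch of what download_and_load_model plausibly does, assuming llama-cpp-python as the GGUF backend (the function body is outside this diff, so the Llama parameters below are assumptions, not the app's actual code):

from huggingface_hub import hf_hub_download
from llama_cpp import Llama  # assumed GGUF backend; not shown in the diff

def download_and_load_model(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    filename="zephyr-7b-beta.Q4_K_M.gguf",
):
    # hf_hub_download returns the cached local path and skips the
    # download when the file already exists in the cache
    model_path = hf_hub_download(repo_id=repo_id, filename=filename)
    # n_ctx (context window) is an illustrative value, not from the diff
    return Llama(model_path=model_path, n_ctx=2048)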
@@ -138,27 +137,37 @@ def respond(
         str: Streaming response
     """
     # Prepare the full prompt with system message and history
-     full_messages = [{"role": "system", "content": system_message}]
-     full_messages.extend(format_history(history))
-     full_messages.append({"role": "user", "content": message})
+     full_prompt = system_message + "\n\n"

-     # Prepare the prompt string for the model
-     prompt = message
+     # Add chat history
+     for user, assistant in history:
+         if user:
+             full_prompt += f"User: {user}\n"
+         if assistant:
+             full_prompt += f"Assistant: {assistant}\n"
+
+     # Add current message
+     full_prompt += f"User: {message}\n"
+     full_prompt += "Assistant: "

     # Generate response with streaming
     response = ""
-     for chunk in llm_model.generate(
-         prompt,
-         max_tokens=max_tokens,
-         stop=[],  # You can add stop sequences if needed
-         temperature=temperature,
-         top_p=top_p,
-         stream=True
-     ):
-         response += chunk
-         yield response
+     try:
+         for chunk in llm_model.generate(
+             full_prompt,
+             max_tokens=max_tokens,
+             stop=["User:", "\n"],  # Stop on new user input
+             temperature=temperature,
+             top_p=top_p,
+             stream=True
+         ):
+             response += chunk
+             yield response
+     except Exception as e:
+         print(f"Error generating response: {e}")
+         yield f"An error occurred: {e}"

- # Create Gradio interface
+ # Create Gradio interface with updated configuration
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
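To make the new prompt construction concrete, this is the string the loop above yields for a one-turn history (all values illustrative):

system_message = "You are a friendly chatbot."
history = [("Hi", "Hello! How can I help?")]
message = "What is a GGUF file?"

# full_prompt, as built by the code above, becomes:
full_prompt = (
    "You are a friendly chatbot.\n\n"
    "User: Hi\n"
    "Assistant: Hello! How can I help?\n"
    "User: What is a GGUF file?\n"
    "Assistant: "
)

Yielding the accumulated response rather than each raw chunk is what gives gr.ChatInterface its live-typing effect: each yield replaces the currently displayed message.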
@@ -173,6 +182,8 @@ demo = gr.ChatInterface(
             label="Top-p (nucleus sampling)",
         ),
     ],
+     # Explicitly set chatbot type to messages
+     chatbot=gr.Chatbot(type="messages")
 )

 if __name__ == "__main__":
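One consequence of type="messages" worth keeping in mind: Gradio then passes history as OpenAI-style role/content dicts rather than (user, assistant) tuples, so the history loop in respond would need the equivalent of this sketch:

# With chatbot=gr.Chatbot(type="messages"), history arrives as, e.g.:
#   [{"role": "user", "content": "Hi"},
#    {"role": "assistant", "content": "Hello!"}]
for turn in history:
    if turn["role"] == "user":
        full_prompt += f"User: {turn['content']}\n"
    elif turn["role"] == "assistant":
        full_prompt += f"Assistant: {turn['content']}\n"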
@@ -180,10 +191,8 @@ if __name__ == "__main__":
     print(f"Available CPU threads: {torch.get_num_threads()}")
     print(f"Model path: {MODEL_PATH}")

-     # Launch the Gradio interface
+     # Launch the Gradio interface with compatible parameters
     demo.launch(
-         # Optional optimization settings
         show_api=False,  # Disable API endpoint
-         enable_queue=True,  # Enable request queuing
-         max_threads=max(torch.get_num_threads() // 2, 1)  # Limit threads
+         share=False  # Do not create public URL
     )
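The dropped launch() arguments no longer exist in Gradio 4.x, where queuing is configured on the app object before launching. A minimal sketch assuming Gradio 4.x (the concurrency value is illustrative and roughly plays the old max_threads role):

# Queuing moved from launch(enable_queue=...) to an explicit .queue() call
demo.queue(default_concurrency_limit=2)
demo.launch(
    show_api=False,  # Disable API endpoint
    share=False      # Do not create public URL
)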
 