Fu01978 committed on
Commit
d19644f
·
verified ·
1 Parent(s): 8dcdd3e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -22
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- from llama_cpp import Llama
3
  import os
4
 
5
  # Download and load the GGUF model
@@ -14,13 +14,28 @@ if not os.path.exists(model_path):
14
  print("Model downloaded!")
15
 
16
  # Load the model
17
- llm = Llama(
18
- model_path=model_path,
19
- n_ctx=2048, # Context window
20
- n_threads=4, # Number of CPU threads
21
- n_gpu_layers=0 # Set to -1 to offload all layers to GPU if available
22
  )
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def chat(message, history):
25
  """
26
  Process chat messages and generate responses.
@@ -29,28 +44,19 @@ def chat(message, history):
29
  message: Current user message
30
  history: List of [user_msg, bot_msg] pairs
31
  """
32
- # Build conversation with proper Llama format
33
- messages = []
34
-
35
- # Add chat history
36
- for user_msg, bot_msg in history:
37
- messages.append({"role": "user", "content": user_msg})
38
- messages.append({"role": "assistant", "content": bot_msg})
39
-
40
- # Add current message
41
- messages.append({"role": "user", "content": message})
42
 
43
  # Generate response
44
- response = llm.create_chat_completion(
45
- messages=messages,
46
- max_tokens=512,
47
  temperature=0.7,
48
  top_p=0.9,
49
- stream=False
50
  )
51
 
52
- # Extract the assistant's response
53
- return response["choices"][0]["message"]["content"]
54
 
55
  # Create Gradio interface
56
  demo = gr.ChatInterface(
 
1
  import gradio as gr
2
+ from ctransformers import AutoModelForCausalLM
3
  import os
4
 
5
  # Download and load the GGUF model
 
14
  print("Model downloaded!")
15
 
16
  # Load the model
17
# Load the GGUF model with ctransformers. Runs on CPU by default;
# raise gpu_layers if a GPU is available.
_LOAD_OPTS = {
    "model_type": "llama",      # GGUF architecture hint for ctransformers
    "context_length": 2048,     # prompt + generation token budget
    "gpu_layers": 0,            # 0 = pure CPU; set higher to offload layers
}
llm = AutoModelForCausalLM.from_pretrained(model_path, **_LOAD_OPTS)
23
 
24
def format_prompt(message, history):
    """Build a Llama 3.2 chat-format prompt string.

    Args:
        message: Current user message.
        history: Iterable of (user_msg, bot_msg) pairs.

    Returns:
        A single prompt string ending with an OPEN assistant header,
        so generation continues as the assistant's reply.
    """
    def closed_turn(role, text):
        # One complete, <|eot_id|>-terminated turn for the given role.
        return f"<|start_header_id|>{role}<|end_header_id|>\n\n{text}<|eot_id|>"

    pieces = []
    for user_msg, bot_msg in history:
        pieces.append(closed_turn("user", user_msg))
        pieces.append(closed_turn("assistant", bot_msg))
    pieces.append(closed_turn("user", message))
    # Deliberately unterminated: the model writes the assistant turn.
    pieces.append("<|start_header_id|>assistant<|end_header_id|>\n\n")
    return "".join(pieces)
38
+
39
def chat(message, history):
    """
    Process chat messages and generate responses.

    Args:
        message: Current user message
        history: List of [user_msg, bot_msg] pairs

    Returns:
        The model's reply text, stripped of surrounding whitespace.
    """
    prompt = format_prompt(message, history)

    # Stop on Llama 3.2 turn delimiters so generation ends after one reply.
    generation_opts = {
        "max_new_tokens": 512,
        "temperature": 0.7,
        "top_p": 0.9,
        "stop": ["<|eot_id|>", "<|start_header_id|>"],
    }
    response = llm(prompt, **generation_opts)
    return response.strip()
 
60
 
61
  # Create Gradio interface
62
  demo = gr.ChatInterface(