willsh1997 commited on
Commit
84a6755
·
verified ·
1 Parent(s): 7bfaef7
Files changed (1) hide show
  1. app.py +35 -36
app.py CHANGED
@@ -66,47 +66,54 @@ from multiprocessing import freeze_support
66
  import gradio as gr
67
  import numpy as np
68
 
69
- from vllm import LLM
 
70
 
71
- @spaces.GPU
72
- def initialize_model():
73
- """Initialize the model - called after proper multiprocessing setup"""
74
- llama3_model_id = "shuyuej/Llama-3.2-1B-Instruct-GPTQ"
75
- llama3_pipe = LLM(
76
- model=llama3_model_id,
77
- quantization="gptq",
78
- gpu_memory_utilization=0.5,
79
- max_model_len=1024
80
- )
81
- return llama3_pipe
82
 
83
- # Global variable to hold the model
84
- llama3_pipe = None
85
 
86
- default_sys_prompt = """You are a helpful chatbot. You respond very conversationally, and help the end user as best as you can."""
 
 
 
 
 
 
87
 
88
  @spaces.GPU
89
  def llama_QA(message_history, system_prompt: str):
90
  """
91
- stupid func for asking llama a question and then getting an answer
92
  inputs:
93
- - input_question [str]: question for llama to answer
 
94
  outputs:
95
  - response [str]: llama's response
96
  """
97
  global llama3_pipe
98
 
99
- # set max gen to 512
100
- sampling_params = llama3_pipe.get_default_sampling_params()
101
- sampling_params.max_tokens = 512
102
-
 
103
  input_message_history = [{"role": "system", "content": system_prompt}]
104
  input_message_history.extend(message_history)
105
-
106
- outputs = llama3_pipe.chat(input_message_history, sampling_params)[0].outputs[0].text
107
- # message_history.append({"role": "assistant", "content": outputs})
108
-
109
- return outputs
 
 
 
 
 
 
 
 
 
110
 
111
 
112
  @dataclass
@@ -257,13 +264,5 @@ def create_demo():
257
 
258
  return demo
259
 
260
-
261
- if __name__ == "__main__":
262
- freeze_support() # Add this for Windows compatibility
263
-
264
- # Initialize the model after freeze_support
265
- llama3_pipe = initialize_model()
266
-
267
- # Create and launch the demo
268
- demo = create_demo()
269
- demo.launch()
 
# --- third-party imports -----------------------------------------------------
import gradio as gr
import numpy as np
import torch
from transformers import pipeline

# Default system prompt used when the caller does not supply one.
default_sys_prompt = """You are a helpful chatbot. You respond very conversationally, and help the end user as best as you can."""

# GPTQ-quantized Llama 3.2 1B Instruct checkpoint served via the
# transformers text-generation pipeline.
llama3_model_id = "shuyuej/Llama-3.2-1B-Instruct-GPTQ"

# Built once at import time; device_map="auto" lets accelerate place the
# weights, fp16 halves the memory footprint, and max_new_tokens caps
# generation length by default.
llama3_pipe = pipeline(
    "text-generation",
    model=llama3_model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    max_new_tokens=512,
)
84
  @spaces.GPU
85
  def llama_QA(message_history, system_prompt: str):
86
  """
87
+ Function for asking llama a question and then getting an answer
88
  inputs:
89
+ - message_history [list]: conversation history
90
+ - system_prompt [str]: system prompt for the model
91
  outputs:
92
  - response [str]: llama's response
93
  """
94
  global llama3_pipe
95
 
96
+ # Lazy initialization - only load model when first called
97
+ if llama3_pipe is None:
98
+ llama3_pipe = initialize_model()
99
+
100
+ # Prepare the message history
101
  input_message_history = [{"role": "system", "content": system_prompt}]
102
  input_message_history.extend(message_history)
103
+
104
+ # Generate response using pipeline
105
+ outputs = llama3_pipe(
106
+ input_message_history,
107
+ max_new_tokens=512,
108
+ do_sample=True,
109
+ temperature=0.7,
110
+ top_p=0.9
111
+ )
112
+
113
+ # Extract the response text
114
+ response = outputs[0]["generated_text"][-1]["content"]
115
+
116
+ return response
117
 
118
 
119
  @dataclass
 
264
 
265
  return demo
266
 
# Build and serve the Gradio app. Guarded so importing this module (e.g. for
# tests or tooling) does not start the web server; `python app.py` — which is
# how Hugging Face Spaces runs the file — behaves exactly as before.
if __name__ == "__main__":
    demo = create_demo()
    demo.launch()