# fast / app.py
# shuarya2011's picture
# Update app.py
# 8d9e69c verified
import gradio as gr
from llama_cpp import Llama
# Load the quantized chat model once, at import time, so every request
# reuses the same in-memory instance.
#   * n_threads=2     -> sized for the Free Tier vCPU allocation
#   * n_gpu_layers=0  -> keep every layer on the CPU; no GPU is expected
llm = Llama.from_pretrained(
    # Which GGUF file to load, and from which repository.
    filename="llama3.2-1b-Uncensored.Q4_K_M.gguf",
    repo_id="mradermacher/llama3.2-1b-Uncensored-GGUF",
    # Runtime settings.
    verbose=False,
    n_gpu_layers=0,
    n_threads=2,
    n_ctx=2048,
)
def _as_text(content):
    """Return the plain-text part of one history item, or "" if it has none."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # presumably a list of content blocks such as
        # {"type": "text", "text": ...} -- TODO confirm against the Gradio
        # version in use. Non-text blocks are ignored.
        texts = (part.get("text") for part in content if isinstance(part, dict))
        return " ".join(t for t in texts if isinstance(t, str) and t)
    # None, file tuples, component payloads, ...: nothing usable as text.
    return ""


def _history_lines(history):
    """Flatten Gradio chat history into "User: ..." / "Assistant: ..." lines.

    Accepts both shapes Gradio may hand to a chat function: a list of
    ``[user_message, assistant_message]`` pairs, or a list of
    ``{"role": ..., "content": ...}`` dicts. Entries that carry no text
    (or an unknown role) are skipped.
    """
    speakers = {"user": "User", "assistant": "Assistant"}
    lines = []
    for entry in history or ():
        if isinstance(entry, dict):
            turns = [(entry.get("role"), entry.get("content"))]
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            turns = [("user", entry[0]), ("assistant", entry[1])]
        else:
            continue
        for role, content in turns:
            speaker = speakers.get(role)
            text = _as_text(content).strip()
            if speaker and text:
                lines.append(f"{speaker}: {text}")
    return lines


def _fit_to_context(lines, tail, budget):
    """Return the newest ``lines`` whose token count, plus ``tail``, fits ``budget``.

    Each line is tokenized on its own, so the total is an estimate of the
    final prompt length rather than an exact count.
    """
    def count(text, add_bos=False):
        return len(llm.tokenize(text.encode("utf-8"), add_bos=add_bos))

    used = count(tail, add_bos=True)
    kept = []
    # Walk from the most recent turn backwards so old turns are dropped first.
    for line in reversed(lines):
        used += count(line + "\n")
        if used > budget:
            break
        kept.append(line)
    kept.reverse()
    return kept


def stream_chat(message, history):
    """Stream the assistant's reply to ``message`` as a growing string.

    Args:
        message: The latest user message.
        history: Earlier turns of the conversation as supplied by Gradio
            (pair lists or role/content dicts). They are folded into the
            prompt so the model sees the conversation so far; the oldest
            turns are dropped when the prompt would not leave room for the
            reply inside the model's context window.

    Yields:
        The reply accumulated so far, once per generated chunk.
    """
    max_new_tokens = 512

    # The final lines of the prompt are always the new user turn plus the
    # cue for the model to answer.
    tail = f"User: {message}\nAssistant: "
    budget = llm.n_ctx() - max_new_tokens
    earlier = _fit_to_context(_history_lines(history), tail, budget)
    prompt = "\n".join([*earlier, tail])

    # NOTE(review): the "\n" stop sequence ends generation at the first line
    # break, so replies are limited to a single line -- confirm this is the
    # intended behavior.
    stream = llm(
        prompt,
        max_tokens=max_new_tokens,
        stop=["User:", "\n"],
        stream=True,  # Enable token-by-token output
        temperature=0.8,
        top_p=0.95,
    )

    partial_text = ""
    for chunk in stream:
        # Each chunk carries only the newly generated text.
        partial_text += chunk["choices"][0]["text"]
        # Yielding the full string so far updates the Gradio UI in real time.
        yield partial_text
# Build the chat UI. stream_chat is the handler for each user message; since
# it is a generator, every string it yields refreshes the displayed reply.
demo = gr.ChatInterface(
    stream_chat,
    description="Smart, uncensored, and fast word-by-word streaming on CPU.",
    title="Llama 3.2 1B Uncensored",
)

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()