Spaces:

shuarya2011
/

FASTEST

Build error

File size: 1,395 Bytes

e1914a0
cedfec7
 
94820f0
cedfec7
 
94820f0
e3ec7c7
 
a9995a4
 
 
94820f0
 
cedfec7
 
 
e3ec7c7
94820f0
cedfec7
94820f0
cedfec7
a9995a4
 
 
94820f0
 
e3ec7c7
cedfec7
 
94820f0
e3ec7c7
94820f0
 
e3ec7c7
 
cedfec7
e3ec7c7
e1914a0
6d39b27
d2a3695
 
94820f0
 
d2a3695
e1914a0
 
e3ec7c7

import gradio as gr
from llama_cpp import Llama

# Load the GGUF model once, at import time; stream_chat reuses this instance.
# verbose=False asks llama-cpp not to print its verbose output to the logs.
llm = Llama(
    verbose=False,
    n_threads=4,
    n_ctx=512,
    model_path="llama3.2-1b-Uncensored-Q4_K_M.gguf",
)

def stream_chat(message, history):
    """Stream a single-turn reply to ``message`` from the module-level model.

    The prompt contains only the current message. ``history`` is accepted
    because the chat UI passes it, but it is not used.

    A reply that opens with one of the "monologue" phrases is cut off. The
    check is done on the whitespace-stripped text (the same text that is
    shown to the user), so a reply whose streamed text begins with a space
    or other whitespace is still caught. Text that could still grow into one
    of the phrases is held back until it is known to be harmless, so a
    fragment such as "I" is not displayed right before "I think" is rejected.

    Args:
        message: The user's latest chat message.
        history: Earlier turns supplied by the chat UI (ignored).

    Yields:
        The reply accumulated so far, stripped of surrounding whitespace.
        Nothing is yielded if the reply is rejected before any text was shown.
    """
    # Minimalist prompt to prevent the model from entering "test/quiz" mode
    prompt = f"User: {message}\nAssistant:"

    stream = llm(
        prompt,
        max_tokens=256,
        # "\n" is a stop sequence, so a reply never spans more than one line.
        stop=["User:", "Assistant:", "\n"],
        stream=True,
        temperature=0,  # Greedy search: prevents "wandering" into reasoning
        repeat_penalty=1.2,
    )

    # Phrases that usually indicate the model is starting a monologue.
    # A tuple lets str.startswith test all of them in one call.
    forbidden_start = ("Okay", "I think", "First", "Let me", "The user")

    partial_text = ""
    visible = ""
    held_back = False

    for output in stream:
        partial_text += output["choices"][0]["text"]
        visible = partial_text.strip()

        # Kill generation if it starts meta-commentary.
        if visible.startswith(forbidden_start):
            return

        # While the text so far is only a prefix of a forbidden phrase (or is
        # still empty) we cannot tell yet whether it is harmless: wait.
        held_back = any(phrase.startswith(visible) for phrase in forbidden_start)
        if not held_back:
            yield visible

    # The stream ended while text was being held back and it never turned
    # into a forbidden phrase, so it is a legitimate (short) reply.
    if held_back and visible:
        yield visible

# Chat UI: a Blocks page whose only component is a chat interface backed by
# the streaming generator stream_chat.
with gr.Blocks() as demo:
    gr.ChatInterface(
        description="Fast, direct, and uncensored.",
        title="DIRECT-LLAMA-MAX",
        fn=stream_chat,
    )

# Start the server only when executed as a script: all interfaces, port 7860.
if __name__ == "__main__":
    demo.launch(server_port=7860, server_name="0.0.0.0")