File size: 677 Bytes
6388a60
d9c2248
c4f569f
d9c2248
c4f569f
 
 
 
 
 
 
6388a60
d9c2248
 
c4f569f
 
 
 
 
 
d9c2248
 
 
6388a60
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import os
import gradio as gr
from llama_cpp import Llama

# Load the quantized GGUF model from the Hugging Face Hub.
# NOTE: the plain Llama(...) constructor only accepts a *local* .gguf file path
# and does not download anything; auto-download from a repo id requires
# Llama.from_pretrained(repo_id=..., filename=...).
model_repo = "Qwen/Qwen2.5-Coder-14B-Instruct-GGUF"
llm = Llama.from_pretrained(
    repo_id=model_repo,
    filename="*q4_k_m.gguf",  # glob matching the Q4_K_M quant (~9GB)
    n_ctx=4096,        # context window; adjust for coding tasks
    n_gpu_layers=99,   # offload all layers to the GPU (e.g. a T4)
    verbose=False,
)

def chat(message, history):
    """Gradio ChatInterface callback: answer *message* given *history*.

    Parameters
    ----------
    message : str
        The user's latest message.
    history : list
        Prior turns as (user, assistant) pairs (gradio's tuple history
        format; newer gradio versions may pass message dicts instead —
        TODO confirm against the installed gradio version).

    Returns
    -------
    str
        The assistant's reply. gr.ChatInterface expects the callback to
        return just the reply string — it manages the history itself —
        so returning (history, "") here was a bug.
    """
    # Rebuild the full conversation so the model actually sees prior
    # turns; the original sent only the latest message, giving the chat
    # no memory.
    messages = []
    for user_msg, bot_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
    )
    return response["choices"][0]["message"]["content"]

# Wire the chat callback into a ready-made chat UI.
demo = gr.ChatInterface(chat)
if __name__ == "__main__":
    # Launch the local Gradio server only when run as a script.
    demo.launch()