"""Minimal Gradio text-generation UI backed by a quantized GGUF model via llama.cpp."""

from llama_cpp import Llama

import gradio as gr

# Load the quantized GGUF model once at startup (model loading is expensive,
# so it must not happen per-request inside the handler).
llm = Llama(model_path="/app/models/qwen2.5-1.5B-q4.gguf")


def generate_text(prompt: str) -> str:
    """Generate a completion for *prompt* using the loaded model.

    Args:
        prompt: The user-supplied prompt text.

    Returns:
        The generated completion text (first choice, up to 200 tokens).
    """
    output = llm(prompt, max_tokens=200)
    return output['choices'][0]['text']


demo = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=2, placeholder="Type your prompt here..."),
    outputs="text",
)

# Guard the launch so importing this module (e.g. from tests or another app)
# does not start the web server as a side effect.
if __name__ == "__main__":
    demo.launch()