"""Gradio chat app serving a GGUF model with llama-cpp-python.

Downloads the quantized model from the Hugging Face Hub once at startup,
then answers chat messages with a single in-process llama.cpp instance.
"""

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download our GGUF model once; hf_hub_download caches locally, so repeat
# launches reuse the cached file instead of re-downloading.
model_path = hf_hub_download(
    repo_id="Genie-AI-Lab/Omni-Genie",
    filename="Qwen2.5-3B-Instruct.Q4_0.gguf",
)

# Load the model a single time at startup.  The original code launched a
# fresh `python -m llama_cpp.server` subprocess for every message, which
# (a) reloads the multi-GB model per request and (b) never returns: that
# command starts a long-running HTTP server that does not read a prompt
# from stdin nor write a completion to stdout, so
# `subprocess.run(..., capture_output=True)` blocks indefinitely.
llm = Llama(model_path=model_path, n_ctx=2048)


def chat(message, history):
    """Generate a reply for *message*, conditioning on the chat *history*.

    Parameters
    ----------
    message : str
        The user's latest input.
    history : list
        Gradio chat history; either a list of ``(user, assistant)`` pairs
        or, with ``type="messages"``, a list of ``{"role", "content"}``
        dicts.  Both shapes are handled.

    Returns
    -------
    str
        The assistant's reply text.
    """
    # Rebuild the full conversation so the model sees prior turns, not
    # just the latest message (the original implementation dropped
    # `history` entirely).
    messages = []
    for turn in history:
        if isinstance(turn, dict):
            # messages-style history entry
            messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            # tuple-style history entry: (user, assistant)
            user_msg, assistant_msg = turn
            messages.append({"role": "user", "content": user_msg})
            if assistant_msg is not None:
                messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    result = llm.create_chat_completion(messages=messages)
    return result["choices"][0]["message"]["content"]


if __name__ == "__main__":
    # Guard the launch so importing this module doesn't start the UI.
    gr.ChatInterface(chat).launch()