"""Gradio chat app serving a GGUF model with llama-cpp-python.

Downloads the quantized model from the Hugging Face Hub once at startup,
then answers chat messages with a single in-process llama.cpp instance.
"""

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download our GGUF model once; hf_hub_download caches locally, so repeat
# launches reuse the cached file instead of re-downloading.
model_path = hf_hub_download(
    repo_id="Genie-AI-Lab/Omni-Genie",
    filename="Qwen2.5-3B-Instruct.Q4_0.gguf",
)

# Load the model a single time at startup.  The original code launched a
# fresh `python -m llama_cpp.server` subprocess for every message, which
# (a) reloads the multi-GB model per request and (b) never returns: that
# command starts a long-running HTTP server that does not read a prompt
# from stdin nor write a completion to stdout, so
# `subprocess.run(..., capture_output=True)` blocks indefinitely.
llm = Llama(model_path=model_path, n_ctx=2048)


def chat(message, history):
    """Generate a reply for *message*, conditioning on the chat *history*.

    Parameters
    ----------
    message : str
        The user's latest input.
    history : list
        Gradio chat history; either a list of ``(user, assistant)`` pairs
        or, with ``type="messages"``, a list of ``{"role", "content"}``
        dicts.  Both shapes are handled.

    Returns
    -------
    str
        The assistant's reply text.
    """
    # Rebuild the full conversation so the model sees prior turns, not
    # just the latest message (the original implementation dropped
    # `history` entirely).
    messages = []
    for turn in history:
        if isinstance(turn, dict):
            # messages-style history entry
            messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            # tuple-style history entry: (user, assistant)
            user_msg, assistant_msg = turn
            messages.append({"role": "user", "content": user_msg})
            if assistant_msg is not None:
                messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    result = llm.create_chat_completion(messages=messages)
    return result["choices"][0]["message"]["content"]


if __name__ == "__main__":
    # Guard the launch so importing this module doesn't start the UI.
    gr.ChatInterface(chat).launch()