import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# 1. Fetch the GGUF file from the Hub repo onto the Space machine.
# Point repo_id at your own repo and nama_file_gguf at the exact GGUF file name.
repo_id = "Daffaadityp/AxonAI-MX4-2.0-GGUF"  # change this if your repo differs
nama_file_gguf = "AxonAI-MX4-2.0-Q4_K_M.gguf"  # must match the file name in the repo

print("Downloading model...")
model_path = hf_hub_download(filename=nama_file_gguf, repo_id=repo_id)

# 2. Load the GGUF model into CPU memory.
print("Loading model to CPU...")
_llama_options = {
    "model_path": model_path,
    # Context capacity in tokens; can be lowered (e.g. 1024) if this is too heavy.
    "n_ctx": 2048,
    # Matched to the CPU Basic tier, which gets 2 vCPUs.
    "n_threads": 2,
}
llm = Llama(**_llama_options)

# 3. Generate an answer for a single user prompt
def prediksi(prompt):
    """Return the model's completion for one user message.

    The message is wrapped in a simple ``User: ... / AxonAI:`` transcript and
    passed to the module-level ``llm``. Adjust the format to the model's own
    chat template if it has one.

    Args:
        prompt: Text entered by the user. ``None`` or whitespace-only input
            is answered with an empty string without running the model.

    Returns:
        The generated text with leading and trailing whitespace stripped.
    """
    # Skip inference for blank input: there is nothing to answer, and a
    # generation of up to 150 tokens would be spent on an empty question.
    if prompt is None or not str(prompt).strip():
        return ""

    hasil = llm(
        f"User: {prompt}\nAxonAI:",  # simple example prompt formatting
        max_tokens=150,  # upper bound on generated tokens
        stop=["User:", "\n\n"],  # stop at the next user turn or a blank line
        echo=False,  # do not repeat the prompt in the returned text
    )
    return hasil["choices"][0]["text"].strip()

# 4. Build the Gradio UI so the function is also reachable as an API
iface = gr.Interface(
    fn=prediksi,
    inputs="text",
    outputs="text",
)
iface.launch()