"""Gradio chat demo backed by a locally spawned llama.cpp ``llama-server``.

Downloads a quantized GGUF model from the Hugging Face Hub, fetches a
prebuilt llama.cpp release tarball, launches ``llama-server`` as a
subprocess, and exposes a simple chat UI that proxies requests to the
server's OpenAI-compatible ``/v1/chat/completions`` endpoint.
"""

import os
import subprocess
import sys
import tarfile
import time

import gradio as gr
import requests
from huggingface_hub import hf_hub_download

# llama-cpp-python has not kept up with upstream releases, so we use the
# prebuilt llama.cpp server binary directly.
LLAMA_CPP_RELEASE_URL = (
    "https://github.com/ggml-org/llama.cpp/releases/download/"
    "b8093/llama-b8093-bin-ubuntu-x64.tar.gz"
)
BINARY_NAME = "llama-server"
SERVER_PORT = "8080"
REPO_ID = "huzpsb/MiniMax-M2-her-4b-en-heretic"
FILENAME = "openher_4b_q4_en_heretic.gguf"


def setup_server():
    """Download the model and server binary, start llama-server, wait for health.

    Returns:
        subprocess.Popen: handle to the running ``llama-server`` process.

    Raises:
        Exception: if the binary is missing after extraction, or the server
            never answers its ``/health`` endpoint within ~2 minutes.
        requests.HTTPError: if the release tarball download fails.
    """
    print(f"[*] Downloading model: {FILENAME}...")
    model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

    if not os.path.exists(BINARY_NAME):
        print("[*] Downloading llama.cpp binary package...")
        response = requests.get(LLAMA_CPP_RELEASE_URL, stream=True, timeout=300)
        response.raise_for_status()
        with open("llama.tar.gz", "wb") as f:
            # Stream to disk in chunks instead of buffering the whole
            # archive in memory via response.content.
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)

        print("[*] Extracting files and handling symlinks...")
        with tarfile.open("llama.tar.gz", "r:gz") as tar:
            for member in tar.getmembers():
                # Flatten the archive: strip directory components so the
                # binary and its .so files all land in the current directory.
                base_name = os.path.basename(member.name)
                if not base_name:
                    continue
                if member.isfile():
                    member.name = base_name
                    tar.extract(member, path=".")
                elif member.issym():
                    # Recreate symlinks by hand since we flattened the paths.
                    link_target = os.path.basename(member.linkname)
                    if os.path.lexists(base_name):
                        os.remove(base_name)
                    try:
                        os.symlink(link_target, base_name)
                        print(f"[*] Created symlink: {base_name} -> {link_target}")
                    except OSError as e:
                        print(f"[!] Failed to create symlink {base_name}: {e}")

        if os.path.exists(BINARY_NAME):
            os.chmod(BINARY_NAME, 0o755)
        else:
            raise Exception(f"Could not find {BINARY_NAME} in the extracted files.")

    print("[*] Starting llama-server with LD_LIBRARY_PATH...")
    new_env = os.environ.copy()
    current_dir = os.getcwd()
    # The shared libraries were extracted into cwd; make the loader find them.
    new_env["LD_LIBRARY_PATH"] = f"{current_dir}:{new_env.get('LD_LIBRARY_PATH', '')}"

    cmd = [
        f"./{BINARY_NAME}",
        "-m", model_path,
        "--port", SERVER_PORT,
        "--ctx-size", "8192",
        "--n-gpu-layers", "0",
        "--host", "127.0.0.1",
        "--cache-ram", "0",
        "--parallel", "1",
        "--no-webui",
        "--threads", "2",
    ]

    proc = subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stderr, env=new_env)

    print("[*] Waiting for server to respond...")
    for _ in range(60):
        try:
            r = requests.get(f"http://127.0.0.1:{SERVER_PORT}/health", timeout=5)
            if r.status_code == 200:
                print("[*] Server is ready!")
                return proc
        except requests.RequestException:
            pass  # server not accepting connections yet
        # Always back off between polls (the original busy-looped on non-200).
        time.sleep(2)

    # Don't leak the subprocess if the server never became healthy.
    proc.terminate()
    raise Exception("Server failed to start. Check logs for missing .so files.")


server_process = setup_server()


def predict(message, history, system_prompt, temperature):
    """Proxy one chat turn to the local llama-server OpenAI-style endpoint.

    Args:
        message: the new user message.
        history: list of (user_msg, bot_msg) tuples from the Gradio chatbot.
        system_prompt: system message prepended to the conversation.
        temperature: sampling temperature forwarded to the server.

    Returns:
        str: the assistant's reply, or an ``"Error: ..."`` string on failure.
    """
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    payload = {
        "messages": messages,
        "temperature": temperature,
        "max_tokens": 100,
        "top_p": 0.95,
        "stream": False,
    }

    print("\n--- [Request Payload] ---")
    print(payload)

    try:
        response = requests.post(
            f"http://127.0.0.1:{SERVER_PORT}/v1/chat/completions",
            json=payload,
            timeout=480,
        )
        response.raise_for_status()
        result = response.json()
        print("--- [Response] ---")
        print(result)
        return result["choices"][0]["message"]["content"]
    except Exception as e:
        # UI boundary: surface the error to the user instead of crashing.
        print(f"--- [Error] --- \n{str(e)}")
        return f"Error: {str(e)}"


with gr.Blocks(theme="soft") as demo:
    gr.Markdown("Test if we can load gguf models with llama.cpp's new server API.")
    chat_interface = gr.ChatInterface(
        fn=predict,
        additional_inputs=[
            gr.Textbox(
                value="You are Alvin, wolf in heat, a wolf-girl.",
                label="System Prompt",
            ),
            gr.Slider(
                minimum=0.0, maximum=1.5, value=0.7, step=0.1, label="Temperature"
            ),
        ],
        chatbot=gr.Chatbot(height=500),
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)