File size: 4,615 Bytes
c3cedc9
 
6b22d2a
5c941b2
 
 
5d33726
5c941b2
5d33726
 
452f7e6
 
c3cedc9
 
6b22d2a
e8dfa38
 
5d33726
452f7e6
c3cedc9
6b22d2a
c3cedc9
 
3b4f893
c3cedc9
6b22d2a
 
3dca2d0
6b22d2a
 
3dca2d0
452f7e6
3b4f893
3dca2d0
6b22d2a
3dca2d0
 
 
 
 
 
 
 
 
 
3b4f893
 
 
 
 
 
 
 
 
c3cedc9
 
 
 
57bc1f8
6b22d2a
57bc1f8
6bc4012
d957ea2
 
 
c3cedc9
 
6b22d2a
3b4f893
 
 
c3cedc9
3b4f893
c3cedc9
3dca2d0
c3cedc9
3b4f893
 
 
 
6b22d2a
c3cedc9
 
452f7e6
3dca2d0
c3cedc9
452f7e6
c3cedc9
5d33726
452f7e6
6b22d2a
 
5d33726
 
 
 
c3cedc9
 
6b22d2a
06d3d55
59bdad1
6b22d2a
c3cedc9
6b22d2a
 
c3cedc9
 
 
 
3f922cc
c3cedc9
 
6b22d2a
 
452f7e6
6b22d2a
c3cedc9
6b22d2a
c3cedc9
5d33726
452f7e6
6b22d2a
59bdad1
452f7e6
6b22d2a
5d33726
6b22d2a
b551a93
6b22d2a
 
 
5d33726
 
 
452f7e6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import os
import subprocess
import sys
import tarfile
import time

import gradio as gr
import requests
from huggingface_hub import hf_hub_download

# NOTE: the llama-cpp-python bindings lag behind upstream llama.cpp releases,
# so this app shells out to a prebuilt llama-server binary instead.

# Prebuilt Linux x64 release tarball of llama.cpp (pinned to tag b8093).
LLAMA_CPP_RELEASE_URL = "https://github.com/ggml-org/llama.cpp/releases/download/b8093/llama-b8093-bin-ubuntu-x64.tar.gz"
# Executable expected inside the tarball after extraction.
BINARY_NAME = "llama-server"
# Kept as a string: used verbatim both in the CLI argv and in URL f-strings.
SERVER_PORT = "8080"
# Hugging Face repo and file name of the GGUF model to serve.
REPO_ID = "huzpsb/MiniMax-M2-her-4b-en-heretic"
FILENAME = "openher_4b_q4_en_heretic.gguf"


def setup_server():
    """Download the GGUF model and a llama.cpp binary release, start
    llama-server, and block until its /health endpoint reports ready.

    Returns:
        subprocess.Popen: the running llama-server process.

    Raises:
        Exception: if the binary is missing after extraction, or the server
            does not become healthy within the retry budget.
        requests.HTTPError: if downloading the release tarball fails.
    """
    print(f"[*] Downloading model: {FILENAME}...")
    model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

    if not os.path.exists(BINARY_NAME):
        print("[*] Downloading llama.cpp binary package...")
        # Stream the archive to disk in chunks instead of buffering the whole
        # body in memory (the original read response.content, which defeats
        # stream=True), and fail loudly on a bad HTTP status.
        response = requests.get(LLAMA_CPP_RELEASE_URL, stream=True, timeout=60)
        response.raise_for_status()
        with open("llama.tar.gz", "wb") as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)

        print("[*] Extracting files and handling symlinks...")
        # The release nests files under build/bin/; flatten everything into
        # the CWD and recreate symlinks (e.g. unversioned .so -> versioned).
        with tarfile.open("llama.tar.gz", "r:gz") as tar:
            for member in tar.getmembers():
                base_name = os.path.basename(member.name)
                if not base_name:
                    continue  # directory entries
                if member.isfile():
                    member.name = base_name
                    tar.extract(member, path=".")
                elif member.issym():
                    link_target = os.path.basename(member.linkname)
                    if os.path.lexists(base_name):
                        os.remove(base_name)
                    try:
                        os.symlink(link_target, base_name)
                        print(f"[*] Created symlink: {base_name} -> {link_target}")
                    except OSError as e:
                        # Best effort: a missing symlink only matters if the
                        # loader actually needs that alias.
                        print(f"[!] Failed to create symlink {base_name}: {e}")

        if os.path.exists(BINARY_NAME):
            os.chmod(BINARY_NAME, 0o755)  # extraction may drop the exec bit
        else:
            raise Exception(f"Could not find {BINARY_NAME} in the extracted files.")

    print("[*] Starting llama-server with LD_LIBRARY_PATH...")
    # The bundled shared libraries were flattened into the CWD, so the dynamic
    # loader needs the CWD on its search path.
    new_env = os.environ.copy()
    current_dir = os.getcwd()
    new_env["LD_LIBRARY_PATH"] = f"{current_dir}:{new_env.get('LD_LIBRARY_PATH', '')}"
    cmd = [
        f"./{BINARY_NAME}",
        "-m", model_path,
        "--port", SERVER_PORT,
        "--ctx-size", "8192",
        "--n-gpu-layers", "0",
        "--host", "127.0.0.1",
        "--cache-ram", "0",
        "--parallel", "1",
        "--no-webui",
        "--threads", "2",
    ]
    proc = subprocess.Popen(
        cmd,
        stdout=sys.stdout,
        stderr=sys.stderr,
        env=new_env
    )

    print("[*] Waiting for server to respond...")
    # BUG FIX: the original only slept and counted retries inside a bare
    # `except:`, so a reachable server answering non-200 (llama-server returns
    # 503 while the model loads) busy-looped forever with no retry budget.
    for _ in range(60):
        if proc.poll() is not None:
            # The server process died (e.g. missing .so) — stop polling.
            break
        try:
            r = requests.get(f"http://127.0.0.1:{SERVER_PORT}/health", timeout=2)
            if r.status_code == 200:
                print("[*] Server is ready!")
                return proc
        except requests.RequestException:
            pass  # not accepting connections yet; fall through to the sleep
        time.sleep(2)

    raise Exception("Server failed to start. Check logs for missing .so files.")


server_process = setup_server()


def predict(message, history, system_prompt, temperature):
    """Forward the chat transcript to the local llama-server and return its reply.

    Args:
        message: the latest user message.
        history: prior turns as (user, assistant) pairs.
        system_prompt: system message placed first in the conversation.
        temperature: sampling temperature passed through to the server.

    Returns:
        The assistant's reply text, or an "Error: ..." string on failure.
    """
    conversation = [{"role": "system", "content": system_prompt}]
    for user_turn, assistant_turn in history:
        conversation.extend((
            {"role": "user", "content": user_turn},
            {"role": "assistant", "content": assistant_turn},
        ))
    conversation.append({"role": "user", "content": message})

    payload = {
        "messages": conversation,
        "temperature": temperature,
        "max_tokens": 100,
        "top_p": 0.95,
        "stream": False
    }
    print("\n--- [Request Payload] ---")
    print(payload)

    endpoint = f"http://127.0.0.1:{SERVER_PORT}/v1/chat/completions"
    try:
        response = requests.post(endpoint, json=payload, timeout=480)
        response.raise_for_status()
        result = response.json()
        print("--- [Response] ---")
        print(result)
        return result["choices"][0]["message"]["content"]
    except Exception as e:
        # Surface the failure in the chat window rather than crashing the UI.
        print(f"--- [Error] --- \n{str(e)}")
        return f"Error: {str(e)}"


# Gradio UI: a chat interface with system-prompt and temperature controls,
# backed by the local llama-server through predict().
with gr.Blocks(theme="soft") as demo:
    # BUG FIX: user-facing text read "load load" (duplicated word).
    gr.Markdown("Test if we can load gguf models with llama.cpp's new server API. ")

    chat_interface = gr.ChatInterface(
        fn=predict,
        additional_inputs=[
            gr.Textbox(value="You are Alvin, wolf in heat, a wolf-girl.", label="System Prompt"),
            gr.Slider(minimum=0.0, maximum=1.5, value=0.7, step=0.1, label="Temperature"),
        ],
        chatbot=gr.Chatbot(height=500),
    )

if __name__ == "__main__":
    # Bind on all interfaces at the conventional Gradio port (7860).
    demo.launch(server_name="0.0.0.0", server_port=7860)