Spaces:
Sleeping
Sleeping
File size: 4,615 Bytes
c3cedc9 6b22d2a 5c941b2 5d33726 5c941b2 5d33726 452f7e6 c3cedc9 6b22d2a e8dfa38 5d33726 452f7e6 c3cedc9 6b22d2a c3cedc9 3b4f893 c3cedc9 6b22d2a 3dca2d0 6b22d2a 3dca2d0 452f7e6 3b4f893 3dca2d0 6b22d2a 3dca2d0 3b4f893 c3cedc9 57bc1f8 6b22d2a 57bc1f8 6bc4012 d957ea2 c3cedc9 6b22d2a 3b4f893 c3cedc9 3b4f893 c3cedc9 3dca2d0 c3cedc9 3b4f893 6b22d2a c3cedc9 452f7e6 3dca2d0 c3cedc9 452f7e6 c3cedc9 5d33726 452f7e6 6b22d2a 5d33726 c3cedc9 6b22d2a 06d3d55 59bdad1 6b22d2a c3cedc9 6b22d2a c3cedc9 3f922cc c3cedc9 6b22d2a 452f7e6 6b22d2a c3cedc9 6b22d2a c3cedc9 5d33726 452f7e6 6b22d2a 59bdad1 452f7e6 6b22d2a 5d33726 6b22d2a b551a93 6b22d2a 5d33726 452f7e6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 | import os
import subprocess
import sys
import tarfile
import time
import gradio as gr
import requests
from huggingface_hub import hf_hub_download
# Pinned llama.cpp release binary, used because the llama-cpp-python
# bindings lag months behind upstream.
LLAMA_CPP_RELEASE_URL = "https://github.com/ggml-org/llama.cpp/releases/download/b8093/llama-b8093-bin-ubuntu-x64.tar.gz"
# Name of the server executable inside the release tarball.
BINARY_NAME = "llama-server"
# Local port llama-server listens on (kept as a string: it is passed
# straight to the CLI and interpolated into URLs).
SERVER_PORT = "8080"
# Hugging Face repo and filename of the quantized GGUF model to serve.
REPO_ID = "huzpsb/MiniMax-M2-her-4b-en-heretic"
FILENAME = "openher_4b_q4_en_heretic.gguf"
def setup_server():
    """Download the GGUF model and a llama.cpp release, then launch llama-server.

    Blocks until the server's /health endpoint answers 200, for at most
    ~120 seconds.

    Returns:
        subprocess.Popen: handle to the running llama-server process.

    Raises:
        Exception: if the binary is missing after extraction, or the server
            never becomes healthy.
        requests.HTTPError: if downloading the release tarball fails.
    """
    print(f"[*] Downloading model: {FILENAME}...")
    model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

    if not os.path.exists(BINARY_NAME):
        print("[*] Downloading llama.cpp binary package...")
        response = requests.get(LLAMA_CPP_RELEASE_URL, stream=True, timeout=120)
        response.raise_for_status()  # fail loudly on a bad release URL
        with open("llama.tar.gz", "wb") as f:
            # Stream to disk in chunks; the original read response.content,
            # which buffers the whole archive in memory and defeats stream=True.
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)

        print("[*] Extracting files and handling symlinks...")
        # Flatten the archive: every regular file is extracted into the CWD
        # under its basename, and symlinks are re-created against flattened
        # targets so the bundled .so files resolve next to the binary.
        with tarfile.open("llama.tar.gz", "r:gz") as tar:
            for member in tar.getmembers():
                base_name = os.path.basename(member.name)
                if not base_name:
                    continue  # directory entries have an empty basename
                if member.isfile():
                    member.name = base_name  # strip directories: extract flat
                    tar.extract(member, path=".")
                elif member.issym():
                    link_target = os.path.basename(member.linkname)
                    if os.path.lexists(base_name):
                        os.remove(base_name)  # replace stale link/file
                    try:
                        os.symlink(link_target, base_name)
                        print(f"[*] Created symlink: {base_name} -> {link_target}")
                    except OSError as e:
                        print(f"[!] Failed to create symlink {base_name}: {e}")

    if os.path.exists(BINARY_NAME):
        os.chmod(BINARY_NAME, 0o755)  # tar extraction may drop the exec bit
    else:
        raise Exception(f"Could not find {BINARY_NAME} in the extracted files.")

    print("[*] Starting llama-server with LD_LIBRARY_PATH...")
    new_env = os.environ.copy()
    current_dir = os.getcwd()
    # The bundled shared libraries were extracted into the CWD, so the CWD
    # must be first on the loader path.
    new_env["LD_LIBRARY_PATH"] = f"{current_dir}:{new_env.get('LD_LIBRARY_PATH', '')}"
    cmd = [
        f"./{BINARY_NAME}",
        "-m", model_path,
        "--port", SERVER_PORT,
        "--ctx-size", "8192",
        "--n-gpu-layers", "0",
        "--host", "127.0.0.1",
        "--cache-ram", "0",
        "--parallel", "1",
        "--no-webui",
        "--threads", "2",
    ]
    proc = subprocess.Popen(
        cmd,
        stdout=sys.stdout,
        stderr=sys.stderr,
        env=new_env,
    )

    print("[*] Waiting for server to respond...")
    for _ in range(60):
        try:
            r = requests.get(f"http://127.0.0.1:{SERVER_PORT}/health", timeout=5)
            if r.status_code == 200:
                print("[*] Server is ready!")
                return proc
        except requests.exceptions.RequestException:
            pass  # server not listening yet; fall through to the sleep
        # BUGFIX: the original only slept / advanced the retry counter inside
        # the except handler, so a reachable server returning non-200 (e.g.
        # 503 while loading the model) busy-looped forever.
        time.sleep(2)

    proc.kill()  # don't leak the child process on the failure path
    raise Exception("Server failed to start. Check logs for missing .so files.")
# Launch the backend once at import time so the Gradio handler below can
# assume a live llama-server on 127.0.0.1:SERVER_PORT.
server_process = setup_server()
def predict(message, history, system_prompt, temperature):
    """Forward one chat turn to the local llama-server and return its reply.

    Args:
        message: the latest user message.
        history: prior (user, assistant) turn pairs from the Gradio chatbot.
        system_prompt: system message placed first in the conversation.
        temperature: sampling temperature forwarded to the server.

    Returns:
        The assistant's reply text, or an "Error: ..." string on any failure.
    """
    # Rebuild the full OpenAI-style conversation on every call.
    conversation = [{"role": "system", "content": system_prompt}]
    for past_user, past_assistant in history:
        conversation.extend([
            {"role": "user", "content": past_user},
            {"role": "assistant", "content": past_assistant},
        ])
    conversation.append({"role": "user", "content": message})

    payload = {
        "messages": conversation,
        "temperature": temperature,
        "max_tokens": 100,
        "top_p": 0.95,
        "stream": False,
    }
    print("\n--- [Request Payload] ---")
    print(payload)

    chat_url = f"http://127.0.0.1:{SERVER_PORT}/v1/chat/completions"
    try:
        response = requests.post(chat_url, json=payload, timeout=480)
        response.raise_for_status()
        result = response.json()
        print("--- [Response] ---")
        print(result)
        return result["choices"][0]["message"]["content"]
    except Exception as e:
        # Best-effort boundary: surface any failure as chat text instead of
        # crashing the Gradio handler.
        print(f"--- [Error] --- \n{str(e)}")
        return f"Error: {str(e)}"
# Minimal chat UI wired to the local llama-server via predict().
with gr.Blocks(theme="soft") as demo:
    # Typo fix: the banner previously read "load load gguf models".
    gr.Markdown("Test if we can load gguf models with llama.cpp's new server API. ")
    chat_interface = gr.ChatInterface(
        fn=predict,
        additional_inputs=[
            gr.Textbox(value="You are Alvin, wolf in heat, a wolf-girl.", label="System Prompt"),
            gr.Slider(minimum=0.0, maximum=1.5, value=0.7, step=0.1, label="Temperature"),
        ],
        chatbot=gr.Chatbot(height=500),
    )

if __name__ == "__main__":
    # 0.0.0.0 so the Space's reverse proxy can reach the app on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)
|