import os
import subprocess
import sys
import tarfile
import time

import gradio as gr
import requests
from huggingface_hub import hf_hub_download
# Llama-cpp-python, f- you for not updating for months! :(
LLAMA_CPP_RELEASE_URL = "https://github.com/ggml-org/llama.cpp/releases/download/b8093/llama-b8093-bin-ubuntu-x64.tar.gz"
BINARY_NAME = "llama-server"
SERVER_PORT = "8080"
REPO_ID = "huzpsb/MiniMax-M2-her-4b-en-heretic"
FILENAME = "openher_4b_q4_en_heretic.gguf"
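

# Flow: download the GGUF model from the Hub, fetch a prebuilt llama.cpp
# release, launch llama-server locally, and forward each chat turn from the
# Gradio UI to the server's OpenAI-compatible /v1/chat/completions endpoint.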
def setup_server():
    print(f"[*] Downloading model: {FILENAME}...")
    model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

    if not os.path.exists(BINARY_NAME):
        print("[*] Downloading llama.cpp binary package...")
        response = requests.get(LLAMA_CPP_RELEASE_URL, stream=True)
        with open("llama.tar.gz", "wb") as f:
            f.write(response.content)
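
        # The release archive nests its files under subdirectories and ships the
        # shared libraries llama-server needs partly as symlinks, so flatten every
        # member into the current directory and recreate the symlinks by hand.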
        print("[*] Extracting files and handling symlinks...")
        with tarfile.open("llama.tar.gz", "r:gz") as tar:
            for member in tar.getmembers():
                base_name = os.path.basename(member.name)
                if not base_name:
                    continue
                if member.isfile():
                    member.name = base_name
                    tar.extract(member, path=".")
                elif member.issym():
                    link_target = os.path.basename(member.linkname)
                    if os.path.lexists(base_name):
                        os.remove(base_name)
                    try:
                        os.symlink(link_target, base_name)
                        print(f"[*] Created symlink: {base_name} -> {link_target}")
                    except OSError as e:
                        print(f"[!] Failed to create symlink {base_name}: {e}")

    if os.path.exists(BINARY_NAME):
        os.chmod(BINARY_NAME, 0o755)
    else:
        raise Exception(f"Could not find {BINARY_NAME} in the extracted files.")
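
    # The freshly extracted .so files live in the working directory, so prepend
    # it to LD_LIBRARY_PATH so the dynamic linker can resolve them when the
    # server binary starts.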
    print("[*] Starting llama-server with LD_LIBRARY_PATH...")
    new_env = os.environ.copy()
    current_dir = os.getcwd()
    new_env["LD_LIBRARY_PATH"] = f"{current_dir}:{new_env.get('LD_LIBRARY_PATH', '')}"

    cmd = [
        f"./{BINARY_NAME}",
        "-m", model_path,
        "--port", SERVER_PORT,
        "--ctx-size", "8192",
        "--n-gpu-layers", "0",  # CPU-only inference
        "--host", "127.0.0.1",  # only reachable from this process
        "--cache-ram", "0",
        "--parallel", "1",
        "--no-webui",  # we only need the HTTP API, not the built-in UI
        "--threads", "2",
    ]
    proc = subprocess.Popen(
        cmd,
        stdout=sys.stdout,
        stderr=sys.stderr,
        env=new_env,
    )
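
    # llama-server exposes a /health endpoint; poll it until the model has
    # finished loading and the server answers with HTTP 200, giving up after
    # roughly two minutes.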
    print("[*] Waiting for server to respond...")
    retries = 0
    while retries < 60:
        try:
            r = requests.get(f"http://127.0.0.1:{SERVER_PORT}/health", timeout=5)
            if r.status_code == 200:
                print("[*] Server is ready!")
                return proc
        except requests.exceptions.RequestException:
            pass
        time.sleep(2)
        retries += 1
    raise Exception("Server failed to start. Check logs for missing .so files.")
server_process = setup_server()
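

# Gradio's ChatInterface calls predict with (message, history, *additional_inputs).
# With the tuple-style chat history used here, `history` is a list of
# (user, assistant) pairs, which is converted into an OpenAI-style messages list
# for llama-server's /v1/chat/completions endpoint.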
def predict(message, history, system_prompt, temperature):
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    payload = {
        "messages": messages,
        "temperature": temperature,
        "max_tokens": 100,
        "top_p": 0.95,
        "stream": False,
    }

    print("\n--- [Request Payload] ---")
    print(payload)

    try:
        response = requests.post(
            f"http://127.0.0.1:{SERVER_PORT}/v1/chat/completions",
            json=payload,
            timeout=480,
        )
        response.raise_for_status()
        result = response.json()
        print("--- [Response] ---")
        print(result)
        return result["choices"][0]["message"]["content"]
    except Exception as e:
        print(f"--- [Error] ---\n{e}")
        return f"Error: {e}"


with gr.Blocks(theme="soft") as demo:
    gr.Markdown("Test if we can load GGUF models with llama.cpp's new server API.")
    chat_interface = gr.ChatInterface(
        fn=predict,
        additional_inputs=[
            gr.Textbox(value="You are Alvin, wolf in heat, a wolf-girl.", label="System Prompt"),
            gr.Slider(minimum=0.0, maximum=1.5, value=0.7, step=0.1, label="Temperature"),
        ],
        chatbot=gr.Chatbot(height=500),
    )


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)