"""Gradio chat demo backed by a locally spawned llama.cpp ``llama-server``.

Downloads a quantized GGUF model from the Hugging Face Hub, fetches a
prebuilt llama.cpp release tarball, launches ``llama-server`` as a
subprocess, and exposes a simple chat UI that proxies requests to the
server's OpenAI-compatible ``/v1/chat/completions`` endpoint.
"""

import os
import subprocess
import sys
import tarfile
import time

import gradio as gr
import requests
from huggingface_hub import hf_hub_download

# llama-cpp-python has not kept up with upstream releases, so we use the
# prebuilt llama.cpp server binary directly.
LLAMA_CPP_RELEASE_URL = (
    "https://github.com/ggml-org/llama.cpp/releases/download/"
    "b8093/llama-b8093-bin-ubuntu-x64.tar.gz"
)
BINARY_NAME = "llama-server"
SERVER_PORT = "8080"
REPO_ID = "huzpsb/MiniMax-M2-her-4b-en-heretic"
FILENAME = "openher_4b_q4_en_heretic.gguf"


def setup_server():
    """Download the model and server binary, start llama-server, wait for health.

    Returns:
        subprocess.Popen: handle to the running ``llama-server`` process.

    Raises:
        Exception: if the binary is missing after extraction, or the server
            never answers its ``/health`` endpoint within ~2 minutes.
        requests.HTTPError: if the release tarball download fails.
    """
    print(f"[*] Downloading model: {FILENAME}...")
    model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

    if not os.path.exists(BINARY_NAME):
        print("[*] Downloading llama.cpp binary package...")
        response = requests.get(LLAMA_CPP_RELEASE_URL, stream=True, timeout=300)
        response.raise_for_status()
        with open("llama.tar.gz", "wb") as f:
            # Stream to disk in chunks instead of buffering the whole
            # archive in memory via response.content.
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)

        print("[*] Extracting files and handling symlinks...")
        with tarfile.open("llama.tar.gz", "r:gz") as tar:
            for member in tar.getmembers():
                # Flatten the archive: strip directory components so the
                # binary and its .so files all land in the current directory.
                base_name = os.path.basename(member.name)
                if not base_name:
                    continue
                if member.isfile():
                    member.name = base_name
                    tar.extract(member, path=".")
                elif member.issym():
                    # Recreate symlinks by hand since we flattened the paths.
                    link_target = os.path.basename(member.linkname)
                    if os.path.lexists(base_name):
                        os.remove(base_name)
                    try:
                        os.symlink(link_target, base_name)
                        print(f"[*] Created symlink: {base_name} -> {link_target}")
                    except OSError as e:
                        print(f"[!] Failed to create symlink {base_name}: {e}")

        if os.path.exists(BINARY_NAME):
            os.chmod(BINARY_NAME, 0o755)
        else:
            raise Exception(f"Could not find {BINARY_NAME} in the extracted files.")

    print("[*] Starting llama-server with LD_LIBRARY_PATH...")
    new_env = os.environ.copy()
    current_dir = os.getcwd()
    # The shared libraries were extracted into cwd; make the loader find them.
    new_env["LD_LIBRARY_PATH"] = f"{current_dir}:{new_env.get('LD_LIBRARY_PATH', '')}"

    cmd = [
        f"./{BINARY_NAME}",
        "-m", model_path,
        "--port", SERVER_PORT,
        "--ctx-size", "8192",
        "--n-gpu-layers", "0",
        "--host", "127.0.0.1",
        "--cache-ram", "0",
        "--parallel", "1",
        "--no-webui",
        "--threads", "2",
    ]

    proc = subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stderr, env=new_env)

    print("[*] Waiting for server to respond...")
    for _ in range(60):
        try:
            r = requests.get(f"http://127.0.0.1:{SERVER_PORT}/health", timeout=5)
            if r.status_code == 200:
                print("[*] Server is ready!")
                return proc
        except requests.RequestException:
            pass  # server not accepting connections yet
        # Always back off between polls (the original busy-looped on non-200).
        time.sleep(2)

    # Don't leak the subprocess if the server never became healthy.
    proc.terminate()
    raise Exception("Server failed to start. Check logs for missing .so files.")


server_process = setup_server()


def predict(message, history, system_prompt, temperature):
    """Proxy one chat turn to the local llama-server OpenAI-style endpoint.

    Args:
        message: the new user message.
        history: list of (user_msg, bot_msg) tuples from the Gradio chatbot.
        system_prompt: system message prepended to the conversation.
        temperature: sampling temperature forwarded to the server.

    Returns:
        str: the assistant's reply, or an ``"Error: ..."`` string on failure.
    """
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    payload = {
        "messages": messages,
        "temperature": temperature,
        "max_tokens": 100,
        "top_p": 0.95,
        "stream": False,
    }

    print("\n--- [Request Payload] ---")
    print(payload)

    try:
        response = requests.post(
            f"http://127.0.0.1:{SERVER_PORT}/v1/chat/completions",
            json=payload,
            timeout=480,
        )
        response.raise_for_status()
        result = response.json()
        print("--- [Response] ---")
        print(result)
        return result["choices"][0]["message"]["content"]
    except Exception as e:
        # UI boundary: surface the error to the user instead of crashing.
        print(f"--- [Error] --- \n{str(e)}")
        return f"Error: {str(e)}"


with gr.Blocks(theme="soft") as demo:
    gr.Markdown("Test if we can load gguf models with llama.cpp's new server API.")
    chat_interface = gr.ChatInterface(
        fn=predict,
        additional_inputs=[
            gr.Textbox(
                value="You are Alvin, wolf in heat, a wolf-girl.",
                label="System Prompt",
            ),
            gr.Slider(
                minimum=0.0, maximum=1.5, value=0.7, step=0.1, label="Temperature"
            ),
        ],
        chatbot=gr.Chatbot(height=500),
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)