Spaces:
Sleeping
Sleeping
| import os | |
| import subprocess | |
| import sys | |
| import tarfile | |
| import time | |
| import gradio as gr | |
| import requests | |
| from huggingface_hub import hf_hub_download | |
# NOTE: we pin a prebuilt llama.cpp release binary instead of the
# llama-cpp-python bindings, which lag behind upstream releases.
LLAMA_CPP_RELEASE_URL = "https://github.com/ggml-org/llama.cpp/releases/download/b8093/llama-b8093-bin-ubuntu-x64.tar.gz"
# Name of the server executable inside the release tarball.
BINARY_NAME = "llama-server"
# Port the local llama-server listens on (string: passed straight to argv).
SERVER_PORT = "8080"
# Hugging Face Hub repo and file for the quantized GGUF model.
REPO_ID = "huzpsb/MiniMax-M2-her-4b-en-heretic"
FILENAME = "openher_4b_q4_en_heretic.gguf"
def setup_server():
    """Download the GGUF model and the llama.cpp binaries, launch
    llama-server as a child process, and block until /health answers 200.

    Returns:
        subprocess.Popen: handle to the running llama-server process.

    Raises:
        Exception: if the binary is missing after extraction, or the
            server never becomes healthy within the retry budget.
        requests.HTTPError: if the binary package download fails.
    """
    print(f"[*] Downloading model: {FILENAME}...")
    model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

    if not os.path.exists(BINARY_NAME):
        print("[*] Downloading llama.cpp binary package...")
        # Stream the archive to disk in chunks; the original used
        # response.content, which buffers the whole file in memory and
        # defeats stream=True. Also fail fast on a non-2xx response so a
        # 404 HTML page is never written out as the tarball.
        response = requests.get(LLAMA_CPP_RELEASE_URL, stream=True, timeout=60)
        response.raise_for_status()
        with open("llama.tar.gz", "wb") as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)

        print("[*] Extracting files and handling symlinks...")
        # Flatten everything into the current directory: the release
        # tarball nests the binaries and shared libraries in subfolders.
        with tarfile.open("llama.tar.gz", "r:gz") as tar:
            for member in tar.getmembers():
                base_name = os.path.basename(member.name)
                if not base_name:
                    continue
                if member.isfile():
                    member.name = base_name
                    tar.extract(member, path=".")
                elif member.issym():
                    # Recreate symlinks by hand so they point at the
                    # flattened names rather than the original paths.
                    link_target = os.path.basename(member.linkname)
                    if os.path.lexists(base_name):
                        os.remove(base_name)
                    try:
                        os.symlink(link_target, base_name)
                        print(f"[*] Created symlink: {base_name} -> {link_target}")
                    except OSError as e:
                        print(f"[!] Failed to create symlink {base_name}: {e}")

        if os.path.exists(BINARY_NAME):
            os.chmod(BINARY_NAME, 0o755)
        else:
            raise Exception(f"Could not find {BINARY_NAME} in the extracted files.")

    print("[*] Starting llama-server with LD_LIBRARY_PATH...")
    new_env = os.environ.copy()
    current_dir = os.getcwd()
    # The extracted .so files sit next to the binary, so prepend the
    # working directory to the dynamic-linker search path.
    new_env["LD_LIBRARY_PATH"] = f"{current_dir}:{new_env.get('LD_LIBRARY_PATH', '')}"
    cmd = [
        f"./{BINARY_NAME}",
        "-m", model_path,
        "--port", SERVER_PORT,
        "--ctx-size", "8192",
        "--n-gpu-layers", "0",
        "--host", "127.0.0.1",
        "--cache-ram", "0",
        "--parallel", "1",
        "--no-webui",
        "--threads", "2",
    ]
    proc = subprocess.Popen(
        cmd,
        stdout=sys.stdout,
        stderr=sys.stderr,
        env=new_env,
    )

    print("[*] Waiting for server to respond...")
    # BUG FIX: the original incremented `retries` only inside the except
    # branch, so a reachable server returning non-200 (e.g. 503 while the
    # model was still loading) caused a busy infinite loop with no sleep.
    # Count every attempt and always pause between polls; also bound each
    # probe with a timeout and catch only requests errors (a bare except
    # would swallow KeyboardInterrupt too).
    for _ in range(60):
        try:
            r = requests.get(f"http://127.0.0.1:{SERVER_PORT}/health", timeout=5)
            if r.status_code == 200:
                print("[*] Server is ready!")
                return proc
        except requests.RequestException:
            pass
        time.sleep(2)
    raise Exception("Server failed to start. Check logs for missing .so files.")
| server_process = setup_server() | |
def predict(message, history, system_prompt, temperature):
    """Forward the chat transcript to the local llama-server and return
    the assistant's reply.

    Args:
        message: Latest user message.
        history: List of (user, assistant) turn pairs from Gradio.
        system_prompt: System message prepended to the conversation.
        temperature: Sampling temperature forwarded to the server.

    Returns:
        The assistant's reply text, or an "Error: ..." string on failure.
    """
    # Rebuild the OpenAI-style message list: system, then alternating
    # user/assistant turns, then the new user message.
    conversation = [{"role": "system", "content": system_prompt}]
    for user_turn, assistant_turn in history:
        conversation += [
            {"role": "user", "content": user_turn},
            {"role": "assistant", "content": assistant_turn},
        ]
    conversation.append({"role": "user", "content": message})

    payload = {
        "messages": conversation,
        "temperature": temperature,
        "max_tokens": 100,
        "top_p": 0.95,
        "stream": False
    }
    print("\n--- [Request Payload] ---")
    print(payload)
    try:
        url = f"http://127.0.0.1:{SERVER_PORT}/v1/chat/completions"
        # Generous timeout: CPU-only inference can be very slow.
        resp = requests.post(url, json=payload, timeout=480)
        resp.raise_for_status()
        body = resp.json()
        print("--- [Response] ---")
        print(body)
        return body["choices"][0]["message"]["content"]
    except Exception as e:
        # UI boundary: surface any failure as chat text instead of crashing.
        print(f"--- [Error] --- \n{str(e)}")
        return f"Error: {str(e)}"
# Minimal Gradio front-end: a single chat interface backed by `predict`,
# with user-tunable system prompt and sampling temperature.
with gr.Blocks(theme="soft") as demo:
    # Fixed duplicated word ("load load") in the user-facing banner.
    gr.Markdown("Test if we can load gguf models with llama.cpp's new server API. ")
    chat_interface = gr.ChatInterface(
        fn=predict,
        additional_inputs=[
            gr.Textbox(value="You are Alvin, wolf in heat, a wolf-girl.", label="System Prompt"),
            gr.Slider(minimum=0.0, maximum=1.5, value=0.7, step=0.1, label="Temperature"),
        ],
        chatbot=gr.Chatbot(height=500),
    )

if __name__ == "__main__":
    # Bind to all interfaces on the standard HF Spaces port.
    demo.launch(server_name="0.0.0.0", server_port=7860)