Update app.py
app.py CHANGED
@@ -1,37 +1,117 @@
+import os
+import time
+import subprocess
+import requests
+import tarfile
 import gradio as gr
 from huggingface_hub import hf_hub_download
-from llama_cpp import Llama

-…
+# --- Configuration ---
+LLAMA_CPP_RELEASE_URL = "https://github.com/ggml-org/llama.cpp/releases/download/b8093/llama-b8093-bin-ubuntu-x64.tar.gz"
+BINARY_NAME = "llama-server"
+SERVER_PORT = "8080"  # port llama-server listens on internally
+REPO_ID = "huzpsb/heru"
+FILENAME = "qwen3p_q4k.gguf"

-llm = Llama(
-    …
-)
+def setup_server():
+    """Download and start llama-server."""
+
+    # 1. Download the model (if not already present)
+    print(f"Downloading model: {FILENAME}...")
+    model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
+    print(f"Model ready at: {model_path}")

+    # 2. Download and extract the llama.cpp binary
+    if not os.path.exists(BINARY_NAME):
+        print("Downloading llama.cpp binary...")
+        response = requests.get(LLAMA_CPP_RELEASE_URL, stream=True)
+        if response.status_code == 200:
+            with open("llama.tar.gz", "wb") as f:
+                f.write(response.content)
+
+            print("Extracting binary...")
+            with tarfile.open("llama.tar.gz", "r:gz") as tar:
+                # Flatten the extraction: find build/bin/llama-server and pull it into the current directory
+                for member in tar.getmembers():
+                    if member.name.endswith(BINARY_NAME):
+                        member.name = BINARY_NAME  # rename so it lands directly in the root directory
+                        tar.extract(member, path=".")
+                        break
+
+            # Make the binary executable
+            os.chmod(BINARY_NAME, 0o755)
+        else:
+            raise Exception("Failed to download llama.cpp binary")
+
+    # 3. Start the background process
+    print("Starting llama-server...")
+    cmd = [
+        f"./{BINARY_NAME}",
+        "-m", model_path,
+        "--port", SERVER_PORT,
+        "--ctx-size", "8192",   # adjust the context size to your Space hardware
+        "--n-gpu-layers", "0",  # 0 for a CPU Space; set to 99 if a GPU is available
+        "--host", "127.0.0.1"
+    ]
+
+    # Use Popen so the main thread is not blocked
+    proc = subprocess.Popen(
+        cmd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True
+    )
+
+    # 4. Wait for the service to be ready (health check)
+    print("Waiting for server to start up...")
+    retries = 0
+    while retries < 30:
+        try:
+            requests.get(f"http://127.0.0.1:{SERVER_PORT}/health")
+            print("Server is ready!")
+            return proc
+        except requests.exceptions.ConnectionError:
+            time.sleep(2)
+            retries += 1
+            print(f"Waiting for server... ({retries}/30)")
+
+    raise Exception("Server failed to start. Check logs.")
+
+# --- Initialize the service ---
+# Note: in HF Spaces, code at global scope runs at startup
+server_process = setup_server()

 def predict(message, history):
+    """Gradio callback: forward the request to the local llama-server."""
+
+    # Build the messages in OpenAI format
     messages = []
     for user_msg, bot_msg in history:
         messages.append({"role": "user", "content": user_msg})
         messages.append({"role": "assistant", "content": bot_msg})
     messages.append({"role": "user", "content": message})
-    response = llm.create_chat_completion(
-        messages=messages,
-        stream=False,
-        temperature=0.7,
-        max_tokens=2048
-    )
-    return response["choices"][0]["message"]["content"]

+    payload = {
+        "messages": messages,
+        "temperature": 0.7,
+        "max_tokens": 2048,
+        "stream": False  # streaming output would require rewriting the requests handling (see the sketch after the diff)
+    }
+
+    try:
+        response = requests.post(
+            f"http://127.0.0.1:{SERVER_PORT}/v1/chat/completions",
+            json=payload,
+            headers={"Content-Type": "application/json"}
+        )
+        response.raise_for_status()
+        return response.json()["choices"][0]["message"]["content"]
+    except Exception as e:
+        return f"Error: {str(e)}"

-with gr.Blocks(title="Llama-CPP Inference") as demo:
+# --- Gradio UI ---
+with gr.Blocks(title="Qwen3 Llama-CPP Inference") as demo:
+    gr.Markdown("### Running Qwen3 via llama-server (b8093)")
     gr.ChatInterface(
         fn=predict,
         chatbot=gr.Chatbot(height=600),
@@ -43,4 +123,4 @@ with gr.Blocks(title="Llama-CPP Inference") as demo:
     )

 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)
+    demo.launch(server_name="0.0.0.0", server_port=7860)
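The "stream": False comment in predict notes that streaming would require reworking the requests handling. Below is a minimal sketch of that variant, not part of the commit: it assumes llama-server's OpenAI-compatible SSE stream ("data: ..." lines terminated by "data: [DONE]"), that SERVER_PORT is in scope as above, and that gr.ChatInterface accepts a generator callback; the name predict_stream is hypothetical.

import json
import requests

def predict_stream(message, history):
    # Hypothetical streaming counterpart to predict(); same message building
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    payload = {
        "messages": messages,
        "temperature": 0.7,
        "max_tokens": 2048,
        "stream": True  # ask llama-server for server-sent events
    }

    with requests.post(
        f"http://127.0.0.1:{SERVER_PORT}/v1/chat/completions",
        json=payload,
        stream=True  # keep the HTTP connection open and read incrementally
    ) as response:
        response.raise_for_status()
        partial = ""
        for line in response.iter_lines():
            if not line:
                continue
            data = line.decode("utf-8").removeprefix("data: ")
            if data == "[DONE]":
                break
            # Each SSE chunk carries an incremental delta in OpenAI format
            delta = json.loads(data)["choices"][0].get("delta", {})
            partial += delta.get("content", "")
            yield partial  # gr.ChatInterface renders successive yields as a growing reply

Wiring this in would only mean passing fn=predict_stream to gr.ChatInterface in place of predict.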