huzpsb committed on
Commit
c3cedc9
·
verified ·
1 Parent(s): 0f769f5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -21
app.py CHANGED
@@ -1,37 +1,117 @@
 
 
 
 
 
1
  import gradio as gr
2
  from huggingface_hub import hf_hub_download
3
- from llama_cpp import Llama
4
 
5
- model_path = hf_hub_download(
6
- repo_id="huzpsb/heru",
7
- filename="qwq_q4k.gguf"
8
- )
 
 
9
 
10
- print(f"Loading: {model_path} ...")
11
- llm = Llama(
12
- model_path=model_path,
13
- n_ctx=81920,
14
- n_gpu_layers=0,
15
- verbose=True
16
- )
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  def predict(message, history):
 
 
 
20
  messages = []
21
  for user_msg, bot_msg in history:
22
  messages.append({"role": "user", "content": user_msg})
23
  messages.append({"role": "assistant", "content": bot_msg})
24
  messages.append({"role": "user", "content": message})
25
- response = llm.create_chat_completion(
26
- messages=messages,
27
- stream=False,
28
- temperature=0.7,
29
- max_tokens=2048
30
- )
31
- return response["choices"][0]["message"]["content"]
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
- with gr.Blocks(title="Llama-CPP Inference") as demo:
 
 
35
  gr.ChatInterface(
36
  fn=predict,
37
  chatbot=gr.Chatbot(height=600),
@@ -43,4 +123,4 @@ with gr.Blocks(title="Llama-CPP Inference") as demo:
43
  )
44
 
45
  if __name__ == "__main__":
46
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
+ import os
2
+ import time
3
+ import subprocess
4
+ import requests
5
+ import tarfile
6
  import gradio as gr
7
  from huggingface_hub import hf_hub_download
 
8
 
9
# --- Configuration ---
LLAMA_CPP_RELEASE_URL = "https://github.com/ggml-org/llama.cpp/releases/download/b8093/llama-b8093-bin-ubuntu-x64.tar.gz"
BINARY_NAME = "llama-server"
SERVER_PORT = "8080" # internal port llama-server listens on (Gradio itself uses 7860)
REPO_ID = "huzpsb/heru"
FILENAME = "qwen3p_q4k.gguf"
16
def setup_server():
    """Download the GGUF model and the llama.cpp server binary, launch
    llama-server as a background process, and block until it is healthy.

    Returns:
        subprocess.Popen: handle of the running llama-server process.

    Raises:
        Exception: if the binary cannot be downloaded/extracted, the server
            process exits early, or it does not become healthy in time.
    """
    # 1. Fetch the model (hf_hub_download caches, so restarts are cheap).
    print(f"Downloading model: {FILENAME}...")
    model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
    print(f"Model ready at: {model_path}")

    # 2. Download and extract the llama.cpp binary if not already present.
    if not os.path.exists(BINARY_NAME):
        print("Downloading llama.cpp binary...")
        response = requests.get(LLAMA_CPP_RELEASE_URL, stream=True, timeout=300)
        if response.status_code != 200:
            raise Exception("Failed to download llama.cpp binary")
        with open("llama.tar.gz", "wb") as f:
            # Stream to disk in chunks instead of buffering the whole
            # archive in memory (stream=True was otherwise unused).
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)

        print("Extracting binary...")
        with tarfile.open("llama.tar.gz", "r:gz") as tar:
            # Flatten: locate build/bin/llama-server and drop it in the CWD.
            for member in tar.getmembers():
                if member.name.endswith(BINARY_NAME):
                    member.name = BINARY_NAME  # rename so it lands in the root dir
                    tar.extract(member, path=".")
                    break
            else:
                # Fail loudly instead of letting os.chmod raise FileNotFoundError.
                raise Exception(f"{BINARY_NAME} not found inside release archive")

        # Make the binary executable.
        os.chmod(BINARY_NAME, 0o755)

    # 3. Start the server as a background process.
    print("Starting llama-server...")
    cmd = [
        f"./{BINARY_NAME}",
        "-m", model_path,
        "--port", SERVER_PORT,
        "--ctx-size", "8192",    # context size; tune to the Space's hardware
        "--n-gpu-layers", "0",   # CPU Space; set to 99 if a GPU is available
        "--host", "127.0.0.1",
    ]
    # Do NOT capture stdout/stderr into an unread PIPE: llama-server logs
    # heavily and a full pipe buffer would eventually block the server.
    # Inheriting the parent's streams also makes the logs visible in the
    # Space console.
    proc = subprocess.Popen(cmd)

    # 4. Health check: poll /health until it answers 200.
    #    llama-server returns 503 while the model is still loading, so we
    #    must check the status code, not merely that the socket connects.
    print("Waiting for server to start up...")
    for attempt in range(1, 31):
        if proc.poll() is not None:
            # The server died — no point waiting out the remaining retries.
            raise Exception(f"llama-server exited early with code {proc.returncode}")
        try:
            health = requests.get(f"http://127.0.0.1:{SERVER_PORT}/health", timeout=5)
            if health.status_code == 200:
                print("Server is ready!")
                return proc
        except requests.exceptions.ConnectionError:
            pass  # not listening yet
        time.sleep(2)
        print(f"Waiting for server... ({attempt}/30)")

    raise Exception("Server failed to start. Check logs.")


# --- Service initialization ---
# NOTE: on HF Spaces, module-level code runs once at container startup.
server_process = setup_server()
83
 
84
def predict(message, history):
    """Gradio callback: forward one chat turn to the local llama-server.

    Args:
        message: the new user message (str).
        history: list of (user_msg, bot_msg) pairs from gr.ChatInterface.

    Returns:
        The assistant's reply text, or an "Error: ..." string on failure.
    """
    # Rebuild the conversation as OpenAI-style chat messages.
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    payload = {
        "messages": messages,
        "temperature": 0.7,
        "max_tokens": 2048,
        "stream": False  # streaming would require chunked response handling here
    }

    try:
        # Generous timeout: CPU inference of up to 2048 tokens is slow, but
        # without any timeout a wedged server would hang this worker forever.
        response = requests.post(
            f"http://127.0.0.1:{SERVER_PORT}/v1/chat/completions",
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=600,
        )
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]
    except Exception as e:
        # Surface the failure in the chat window instead of crashing the UI.
        return f"Error: {str(e)}"
111
 
112
+ # --- Gradio UI ---
113
+ with gr.Blocks(title="Qwen3 Llama-CPP Inference") as demo:
114
+ gr.Markdown(f"### Running Qwen3 via llama-server (b8093)")
115
  gr.ChatInterface(
116
  fn=predict,
117
  chatbot=gr.Chatbot(height=600),
 
123
  )
124
 
125
if __name__ == "__main__":
    # Bind on all interfaces at port 7860 — the port HF Spaces routes to.
    demo.launch(server_name="0.0.0.0", server_port=7860)