huzpsb committed on
Commit
452f7e6
·
verified ·
1 Parent(s): 3dca2d0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -30
app.py CHANGED
@@ -7,38 +7,33 @@ import sys
7
  import gradio as gr
8
  from huggingface_hub import hf_hub_download
9
 
10
- # --- 配置 ---
 
11
  LLAMA_CPP_RELEASE_URL = "https://github.com/ggml-org/llama.cpp/releases/download/b8093/llama-b8093-bin-ubuntu-x64.tar.gz"
12
  BINARY_NAME = "llama-server"
13
  SERVER_PORT = "8080"
14
  REPO_ID = "huzpsb/heru"
15
  FILENAME = "qwq_q4k.gguf"
16
 
 
17
  def setup_server():
18
- """下载并启动 llama-server,处理动态链接库及符号链接"""
19
  print(f"[*] Downloading model: {FILENAME}...")
20
  model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
21
-
22
  if not os.path.exists(BINARY_NAME):
23
  print("[*] Downloading llama.cpp binary package...")
24
  response = requests.get(LLAMA_CPP_RELEASE_URL, stream=True)
25
  with open("llama.tar.gz", "wb") as f:
26
  f.write(response.content)
27
-
28
  print("[*] Extracting files and handling symlinks...")
29
  with tarfile.open("llama.tar.gz", "r:gz") as tar:
30
  for member in tar.getmembers():
31
  base_name = os.path.basename(member.name)
32
- if not base_name: continue # 跳过目录本身
33
-
34
  if member.isfile():
35
- # 提取普通文件并去除路径前缀
36
  member.name = base_name
37
  tar.extract(member, path=".")
38
  elif member.issym():
39
- # 处理符号链接
40
  link_target = os.path.basename(member.linkname)
41
- # 如果链接已存在,先删除
42
  if os.path.lexists(base_name):
43
  os.remove(base_name)
44
  try:
@@ -55,25 +50,21 @@ def setup_server():
55
  print("[*] Starting llama-server with LD_LIBRARY_PATH...")
56
  new_env = os.environ.copy()
57
  current_dir = os.getcwd()
58
- # 确保 LD_LIBRARY_PATH 包含当前目录,以便加载 .so 文件
59
  new_env["LD_LIBRARY_PATH"] = f"{current_dir}:{new_env.get('LD_LIBRARY_PATH', '')}"
60
-
61
  cmd = [
62
  f"./{BINARY_NAME}",
63
  "-m", model_path,
64
  "--port", SERVER_PORT,
65
- "--ctx-size", "8192",
66
  "--n-gpu-layers", "0",
67
  "--host", "127.0.0.1"
68
  ]
69
-
70
  proc = subprocess.Popen(
71
  cmd,
72
  stdout=sys.stdout,
73
  stderr=sys.stderr,
74
  env=new_env
75
  )
76
-
77
  print("[*] Waiting for server to respond...")
78
  retries = 0
79
  while retries < 60:
@@ -85,34 +76,27 @@ def setup_server():
85
  except:
86
  time.sleep(2)
87
  retries += 1
88
-
89
  raise Exception("Server failed to start. Check logs for missing .so files.")
90
 
91
- # 初始化
92
  server_process = setup_server()
93
 
94
- # --- 修改后的预测函数 ---
95
  def predict(message, history, system_prompt, temperature):
96
- """
97
- Gradio 回调:现在支持自定义 system prompt 和 temperature
98
- """
99
  messages = [{"role": "system", "content": system_prompt}]
100
  for user_msg, bot_msg in history:
101
  messages.append({"role": "user", "content": user_msg})
102
  messages.append({"role": "assistant", "content": bot_msg})
103
  messages.append({"role": "user", "content": message})
104
-
105
  payload = {
106
  "messages": messages,
107
  "temperature": temperature,
108
  "max_tokens": 2048,
109
  "stream": False
110
  }
111
-
112
- # --- 调试日志 ---
113
  print("\n--- [Request Payload] ---")
114
  print(payload)
115
-
116
  try:
117
  response = requests.post(
118
  f"http://127.0.0.1:{SERVER_PORT}/v1/chat/completions",
@@ -121,19 +105,17 @@ def predict(message, history, system_prompt, temperature):
121
  )
122
  response.raise_for_status()
123
  result = response.json()
124
-
125
  print("--- [Response] ---")
126
- print(result) # 打印完整响应
127
-
128
  return result["choices"][0]["message"]["content"]
129
  except Exception as e:
130
  print(f"--- [Error] --- \n{str(e)}")
131
  return f"Error: {str(e)}"
132
 
133
- # --- Gradio UI (支持 API 参数) ---
134
  with gr.Blocks(theme="soft") as demo:
135
  gr.Markdown("## Qwen3 Inference via llama-server")
136
-
137
  chat_interface = gr.ChatInterface(
138
  fn=predict,
139
  additional_inputs=[
@@ -144,4 +126,4 @@ with gr.Blocks(theme="soft") as demo:
144
  )
145
 
146
  if __name__ == "__main__":
147
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
7
  import gradio as gr
8
  from huggingface_hub import hf_hub_download
9
 
10
+ # Llama-cpp-python, f- you for not updating for months! :(
11
+
12
  LLAMA_CPP_RELEASE_URL = "https://github.com/ggml-org/llama.cpp/releases/download/b8093/llama-b8093-bin-ubuntu-x64.tar.gz"
13
  BINARY_NAME = "llama-server"
14
  SERVER_PORT = "8080"
15
  REPO_ID = "huzpsb/heru"
16
  FILENAME = "qwq_q4k.gguf"
17
 
18
+
19
  def setup_server():
 
20
  print(f"[*] Downloading model: {FILENAME}...")
21
  model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
 
22
  if not os.path.exists(BINARY_NAME):
23
  print("[*] Downloading llama.cpp binary package...")
24
  response = requests.get(LLAMA_CPP_RELEASE_URL, stream=True)
25
  with open("llama.tar.gz", "wb") as f:
26
  f.write(response.content)
 
27
  print("[*] Extracting files and handling symlinks...")
28
  with tarfile.open("llama.tar.gz", "r:gz") as tar:
29
  for member in tar.getmembers():
30
  base_name = os.path.basename(member.name)
31
+ if not base_name: continue
 
32
  if member.isfile():
 
33
  member.name = base_name
34
  tar.extract(member, path=".")
35
  elif member.issym():
 
36
  link_target = os.path.basename(member.linkname)
 
37
  if os.path.lexists(base_name):
38
  os.remove(base_name)
39
  try:
 
50
  print("[*] Starting llama-server with LD_LIBRARY_PATH...")
51
  new_env = os.environ.copy()
52
  current_dir = os.getcwd()
 
53
  new_env["LD_LIBRARY_PATH"] = f"{current_dir}:{new_env.get('LD_LIBRARY_PATH', '')}"
 
54
  cmd = [
55
  f"./{BINARY_NAME}",
56
  "-m", model_path,
57
  "--port", SERVER_PORT,
58
+ "--ctx-size", "81920",
59
  "--n-gpu-layers", "0",
60
  "--host", "127.0.0.1"
61
  ]
 
62
  proc = subprocess.Popen(
63
  cmd,
64
  stdout=sys.stdout,
65
  stderr=sys.stderr,
66
  env=new_env
67
  )
 
68
  print("[*] Waiting for server to respond...")
69
  retries = 0
70
  while retries < 60:
 
76
  except:
77
  time.sleep(2)
78
  retries += 1
79
+
80
  raise Exception("Server failed to start. Check logs for missing .so files.")
81
 
82
+
83
  server_process = setup_server()
84
 
85
+
86
  def predict(message, history, system_prompt, temperature):
 
 
 
87
  messages = [{"role": "system", "content": system_prompt}]
88
  for user_msg, bot_msg in history:
89
  messages.append({"role": "user", "content": user_msg})
90
  messages.append({"role": "assistant", "content": bot_msg})
91
  messages.append({"role": "user", "content": message})
 
92
  payload = {
93
  "messages": messages,
94
  "temperature": temperature,
95
  "max_tokens": 2048,
96
  "stream": False
97
  }
 
 
98
  print("\n--- [Request Payload] ---")
99
  print(payload)
 
100
  try:
101
  response = requests.post(
102
  f"http://127.0.0.1:{SERVER_PORT}/v1/chat/completions",
 
105
  )
106
  response.raise_for_status()
107
  result = response.json()
 
108
  print("--- [Response] ---")
109
+ print(result)
 
110
  return result["choices"][0]["message"]["content"]
111
  except Exception as e:
112
  print(f"--- [Error] --- \n{str(e)}")
113
  return f"Error: {str(e)}"
114
 
115
+
116
  with gr.Blocks(theme="soft") as demo:
117
  gr.Markdown("## Qwen3 Inference via llama-server")
118
+
119
  chat_interface = gr.ChatInterface(
120
  fn=predict,
121
  additional_inputs=[
 
126
  )
127
 
128
  if __name__ == "__main__":
129
+ demo.launch(server_name="0.0.0.0", server_port=7860)