Spaces:

zai-org
/

GLM-4.5-Space

Running

App Files Community

zRzRzRzRzRzRzR committed on Jul 28

Commit

325c020

1 Parent(s): 9ec8fec

update

Browse files

Files changed (1) hide show

app.py +36 -74

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import copy
-import re
 import time
 import html
 from openai import OpenAI
@@ -11,7 +10,6 @@ stop_generation = False
 def stream_from_vllm(messages, thinking_enabled=True, temperature=1.0):
     global stop_generation
     client = OpenAI()
     response = client.chat.completions.create(
         model="GLM-4.5",
         messages=messages,
@@ -19,72 +17,43 @@ def stream_from_vllm(messages, thinking_enabled=True, temperature=1.0):
         stream=True,
         max_tokens=65536,
         extra_body={
-            "thinking":
-                {
-                    "type": "enabled" if thinking_enabled else "disabled",
-                }
         }
     )
     for chunk in response:
         if stop_generation:
             break
         if chunk.choices and chunk.choices[0].delta:
-            delta = chunk.choices[0].delta
-            yield delta
 class GLM45Model:
     def __init__(self):
-        self.reset_state()
     def reset_state(self):
-        self.accumulated_text = ""
-    def _strip_html(self, text: str) -> str:
-        return re.sub(r"<[^>]+>", "", text).strip()
-    def _wrap_text(self, text: str):
-        return [{"type": "text", "text": text}]
-    def _parse_thinking_content(self, text: str):
-        thinking_content = ""
-        regular_content = ""
-        if "<think>" in text:
-            think_pattern = r'<think>(.*?)</think>'
-            think_match = re.search(think_pattern, text, re.DOTALL)
-            if think_match:
-                thinking_content = think_match.group(1).strip()
-                regular_content = re.sub(think_pattern, '', text, flags=re.DOTALL).strip()
-            else:
-                think_start = text.find("<think>")
-                if think_start != -1:
-                    thinking_content = text[think_start + 7:]
-                    regular_content = text[:think_start].strip()
-        else:
-            regular_content = text
-        return thinking_content, regular_content
-    def _render_response(self, thinking_content: str, regular_content: str, skip_think: bool = False):
         html_parts = []
-        if thinking_content and not skip_think:
-            thinking_escaped = html.escape(thinking_content).replace("\n", "<br>")
             think_html = (
-                    "<details open><summary style='cursor:pointer;font-weight:bold;color:#007acc;'>💭 Thinking</summary>"
                     "<div style='color:#555555;line-height:1.6;padding:15px;border-left:4px solid #007acc;margin:10px 0;background-color:#f0f7ff;border-radius:4px;'>"
-                    + thinking_escaped +
                     "</div></details>"
             )
             html_parts.append(think_html)
         if regular_content:
-            content_escaped = html.escape(regular_content)
-            content_formatted = content_escaped.replace("\n", "<br>")
-            content_html = f"<div style='margin:0.5em 0; white-space: pre-wrap; line-height:1.6;'>{content_formatted}</div>"
             html_parts.append(content_html)
         return "".join(html_parts)
@@ -93,21 +62,20 @@ class GLM45Model:
         msgs = []
         if sys_prompt.strip():
             msgs.append({"role": "system", "content": [{"type": "text", "text": sys_prompt.strip()}]})
         for h in raw_hist:
             if h["role"] == "user":
-                msgs.append({"role": "user", "content": self._wrap_text(h["content"])})
             else:
-                raw = re.sub(r"<details.*?</details>", "", h["content"], flags=re.DOTALL)
-                clean_content = self._strip_html(raw).strip()
                 if clean_content:
-                    msgs.append({"role": "assistant", "content": self._wrap_text(clean_content)})
         return msgs
-    def stream_generate(self, raw_hist, sys_prompt: str, thinking_enabled: bool = True, temperature: float = 1.0):
         global stop_generation
         stop_generation = False
         msgs = self._build_messages(raw_hist, sys_prompt)
         self.reset_state()
         try:
@@ -115,20 +83,16 @@ class GLM45Model:
                 if stop_generation:
                     break
-                delta_content = ""
                 if hasattr(delta, 'content') and delta.content:
-                    delta_content = delta.content
-                elif isinstance(delta, dict) and 'content' in delta and delta['content']:
-                    delta_content = delta['content']
-                if delta_content:
-                    self.accumulated_text += delta_content
-                    thinking_content, regular_content = self._parse_thinking_content(self.accumulated_text)
-                    yield self._render_response(thinking_content, regular_content, not thinking_enabled)
         except Exception as e:
-            error_msg = f"Error during streaming: {str(e)}"
-            yield self._render_response("", error_msg)
 glm45 = GLM45Model()
@@ -141,10 +105,10 @@ def chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
     if not msg.strip():
         return raw_hist, copy.deepcopy(raw_hist), ""
-    user_rec = {"role": "user", "content": msg.strip()}
     if raw_hist is None:
         raw_hist = []
-    raw_hist.append(user_rec)
     place = {"role": "assistant", "content": ""}
     raw_hist.append(place)
@@ -157,12 +121,9 @@ def chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
             place["content"] = chunk
             yield raw_hist, copy.deepcopy(raw_hist), ""
     except Exception as e:
-        error_content = f"<div style='color: red;'>Error: {html.escape(str(e))}</div>"
-        place["content"] = error_content
         yield raw_hist, copy.deepcopy(raw_hist), ""
-    yield raw_hist, copy.deepcopy(raw_hist), ""
 def reset():
     global stop_generation
@@ -177,13 +138,14 @@ with demo:
     gr.HTML(
         "<div style='text-align:center;font-size:32px;font-weight:bold;margin-bottom:10px;'>GLM-4.5 API Demo</div>"
         "<div style='text-align:center;color:red;font-size:16px;margin-bottom:20px;'>"
-        "This demo uses the API version of the service for faster response.<br>"
-        "Chat only. For tool use, MCP support, and web search, please refer to the API.</div>"
-        "<div style='text-align:center;'><a href='https://huggingface.co/zai-org/GLM-4.5'>Model Hub</a> | "
         "<a href='https://github.com/zai-org/GLM-4.5'>Github</a> | "
         "<a href='http://z.ai/blog/glm-4.5'>Blog</a> | "
-        "<a href='https://docs.bigmodel.cn/cn/guide/models/text/glm-4.5'>API</a></div>"
     )
     raw_history = gr.State([])
     with gr.Row():
@@ -204,8 +166,8 @@ with demo:
             thinking_toggle = gr.Checkbox(label="Enable Thinking", value=True)
             gr.HTML(
                 "<div style='color:red;font-size:12px;margin-top:5px;margin-bottom:15px;'>"
-                "ON: Enable model thinking.<br>"
-                "OFF: Not enable model thinking, the model will directly answer the question without reasoning."
                 "</div>"
             )
             temperature_slider = gr.Slider(

 import copy
 import time
 import html
 from openai import OpenAI
 def stream_from_vllm(messages, thinking_enabled=True, temperature=1.0):
     global stop_generation
     client = OpenAI()
     response = client.chat.completions.create(
         model="GLM-4.5",
         messages=messages,
         stream=True,
         max_tokens=65536,
         extra_body={
+            "thinking": {
+                "type": "enabled" if thinking_enabled else "disabled",
+            }
         }
     )
     for chunk in response:
         if stop_generation:
             break
         if chunk.choices and chunk.choices[0].delta:
+            yield chunk.choices[0].delta
 class GLM45Model:
     def __init__(self):
+        self.accumulated_content = ""
+        self.accumulated_reasoning = ""
     def reset_state(self):
+        self.accumulated_content = ""
+        self.accumulated_reasoning = ""
+    def _render_response(self, reasoning_content, regular_content, skip_think=False):
         html_parts = []
+        if reasoning_content and not skip_think:
+            reasoning_escaped = html.escape(reasoning_content).replace("\n", "<br>")
             think_html = (
+                    "<details open><summary style='cursor:pointer;font-weight:bold;color:#007acc;'>🤔 Thinking</summary>"
                     "<div style='color:#555555;line-height:1.6;padding:15px;border-left:4px solid #007acc;margin:10px 0;background-color:#f0f7ff;border-radius:4px;'>"
+                    + reasoning_escaped +
                     "</div></details>"
             )
             html_parts.append(think_html)
         if regular_content:
+            content_escaped = html.escape(regular_content).replace("\n", "<br>")
+            content_html = f"<div style='margin:0.5em 0; white-space: pre-wrap; line-height:1.6;'>{content_escaped}</div>"
             html_parts.append(content_html)
         return "".join(html_parts)
         msgs = []
         if sys_prompt.strip():
             msgs.append({"role": "system", "content": [{"type": "text", "text": sys_prompt.strip()}]})
         for h in raw_hist:
             if h["role"] == "user":
+                msgs.append({"role": "user", "content": [{"type": "text", "text": h["content"]}]})
             else:
+                clean_content = html.escape(h["content"]).replace("<br>", "\n")
                 if clean_content:
+                    msgs.append({"role": "assistant", "content": [{"type": "text", "text": clean_content}]})
         return msgs
+    def stream_generate(self, raw_hist, sys_prompt, thinking_enabled=True, temperature=1.0):
         global stop_generation
         stop_generation = False
         msgs = self._build_messages(raw_hist, sys_prompt)
         self.reset_state()
         try:
                 if stop_generation:
                     break
                 if hasattr(delta, 'content') and delta.content:
+                    self.accumulated_content += delta.content
+                if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
+                    self.accumulated_reasoning += delta.reasoning_content
+                yield self._render_response(self.accumulated_reasoning, self.accumulated_content, not thinking_enabled)
         except Exception as e:
+            yield self._render_response("", f"Error: {str(e)}")
 glm45 = GLM45Model()
     if not msg.strip():
         return raw_hist, copy.deepcopy(raw_hist), ""
     if raw_hist is None:
         raw_hist = []
+    raw_hist.append({"role": "user", "content": msg.strip()})
     place = {"role": "assistant", "content": ""}
     raw_hist.append(place)
             place["content"] = chunk
             yield raw_hist, copy.deepcopy(raw_hist), ""
     except Exception as e:
+        place["content"] = f"<div style='color: red;'>Error: {html.escape(str(e))}</div>"
         yield raw_hist, copy.deepcopy(raw_hist), ""
 def reset():
     global stop_generation
     gr.HTML(
         "<div style='text-align:center;font-size:32px;font-weight:bold;margin-bottom:10px;'>GLM-4.5 API Demo</div>"
         "<div style='text-align:center;color:red;font-size:16px;margin-bottom:20px;'>"
+        "This demo uses the API version of the service for faster response speeds.<br>"
+        "Only chat functionality is supported. For tool usage, MCP support, and web search, please refer to the API documentation.</div>"
+        "<div style='text-align:center;'><a href='https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b'>Model</a> | "
         "<a href='https://github.com/zai-org/GLM-4.5'>Github</a> | "
         "<a href='http://z.ai/blog/glm-4.5'>Blog</a> | "
+        "<a href='https://docs.bigmodel.cn/cn/guide/models/text/glm-4.5'>API Docs</a></div>"
     )
     raw_history = gr.State([])
     with gr.Row():
             thinking_toggle = gr.Checkbox(label="Enable Thinking", value=True)
             gr.HTML(
                 "<div style='color:red;font-size:12px;margin-top:5px;margin-bottom:15px;'>"
+                "Enabled: Activates the model's thinking capability. The model will decide whether to think based on the situation and may return empty thinking content.<br>"
+                "Disabled: Disables the model's thinking capability. The model will answer questions directly without reasoning."
                 "</div>"
             )
             temperature_slider = gr.Slider(