Spaces:
Paused
Paused
Update proxy_handler.py
Browse files- proxy_handler.py +54 -52
proxy_handler.py
CHANGED
|
@@ -53,9 +53,6 @@ class ProxyHandler:
|
|
| 53 |
cleaned_text = re.sub(r'</?details[^>]*>', '', cleaned_text)
|
| 54 |
|
| 55 |
# 3. Handle markdown blockquotes.
|
| 56 |
-
# The reference implementation removes "> " at the start of lines.
|
| 57 |
-
# The original Python implementation is more robust and correctly
|
| 58 |
-
# preserves multi-level quotes (e.g., '>>'). We'll keep it.
|
| 59 |
cleaned_text = re.sub(r'^\s*>\s*(?!>)', '', cleaned_text, flags=re.MULTILINE)
|
| 60 |
|
| 61 |
# 4. Remove other known text artifacts.
|
|
@@ -66,13 +63,16 @@ class ProxyHandler:
|
|
| 66 |
|
| 67 |
def _clean_answer_content(self, text: str) -> str:
|
| 68 |
"""
|
| 69 |
-
Cleans
|
| 70 |
Does NOT strip whitespace to preserve markdown in streams.
|
| 71 |
"""
|
| 72 |
if not text:
|
| 73 |
return ""
|
| 74 |
-
# Remove
|
| 75 |
cleaned_text = re.sub(r'<glm_block.*?</glm_block>', '', text, flags=re.DOTALL)
|
|
|
|
|
|
|
|
|
|
| 76 |
return cleaned_text
|
| 77 |
|
| 78 |
def _serialize_msgs(self, msgs) -> list:
|
|
@@ -102,7 +102,6 @@ class ProxyHandler:
|
|
| 102 |
think_open = False
|
| 103 |
yielded_think_buffer = ""
|
| 104 |
current_raw_thinking = ""
|
| 105 |
-
answer_started = False
|
| 106 |
|
| 107 |
async def yield_delta(content_type: str, text: str):
|
| 108 |
nonlocal think_open, yielded_think_buffer
|
|
@@ -142,41 +141,39 @@ class ProxyHandler:
|
|
| 142 |
|
| 143 |
payload_str = line[6:]
|
| 144 |
if payload_str == '[DONE]':
|
| 145 |
-
if think_open:
|
| 146 |
-
|
|
|
|
|
|
|
|
|
|
| 147 |
try:
|
| 148 |
dat = json.loads(payload_str).get("data", {})
|
| 149 |
-
except (json.JSONDecodeError, AttributeError):
|
|
|
|
| 150 |
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
yield item
|
| 157 |
continue
|
| 158 |
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
if match:
|
| 167 |
-
thinking_part, answer_part = match.groups()
|
| 168 |
-
# FIX: Expanded to a proper indented block
|
| 169 |
-
async for item in yield_delta("thinking", thinking_part):
|
| 170 |
-
yield item
|
| 171 |
-
# FIX: Expanded to a proper indented block
|
| 172 |
-
if answer_part:
|
| 173 |
-
async for item in yield_delta("answer", answer_part):
|
| 174 |
-
yield item
|
| 175 |
-
answer_started = True
|
| 176 |
-
else:
|
| 177 |
-
# FIX: Expanded to a proper indented block
|
| 178 |
async for item in yield_delta("thinking", current_raw_thinking):
|
| 179 |
yield item
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
except Exception:
|
| 181 |
logger.exception("Stream error"); raise
|
| 182 |
|
|
@@ -205,27 +202,32 @@ class ProxyHandler:
|
|
| 205 |
dat = json.loads(payload_str).get("data", {})
|
| 206 |
except (json.JSONDecodeError, AttributeError): continue
|
| 207 |
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
continue
|
| 212 |
|
| 213 |
-
|
| 214 |
-
|
| 215 |
|
| 216 |
-
if
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
if match:
|
| 223 |
-
last_thinking_content, answer_part = match.groups()
|
| 224 |
-
if answer_part: raw_answer_parts.append(answer_part)
|
| 225 |
-
answer_started = True
|
| 226 |
-
else:
|
| 227 |
last_thinking_content = current_raw_thinking
|
| 228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
break
|
| 230 |
|
| 231 |
full_answer = ''.join(raw_answer_parts)
|
|
|
|
| 53 |
cleaned_text = re.sub(r'</?details[^>]*>', '', cleaned_text)
|
| 54 |
|
| 55 |
# 3. Handle markdown blockquotes.
|
|
|
|
|
|
|
|
|
|
| 56 |
cleaned_text = re.sub(r'^\s*>\s*(?!>)', '', cleaned_text, flags=re.MULTILINE)
|
| 57 |
|
| 58 |
# 4. Remove other known text artifacts.
|
|
|
|
| 63 |
|
| 64 |
def _clean_answer_content(self, text: str) -> str:
|
| 65 |
"""
|
| 66 |
+
Cleans unwanted tags from answer content.
|
| 67 |
Does NOT strip whitespace to preserve markdown in streams.
|
| 68 |
"""
|
| 69 |
if not text:
|
| 70 |
return ""
|
| 71 |
+
# Remove tool call blocks
|
| 72 |
cleaned_text = re.sub(r'<glm_block.*?</glm_block>', '', text, flags=re.DOTALL)
|
| 73 |
+
# Remove any residual details/summary blocks that might leak into the answer
|
| 74 |
+
cleaned_text = re.sub(r'<details[^>]*>.*?</details>', '', cleaned_text, flags=re.DOTALL)
|
| 75 |
+
cleaned_text = re.sub(r'<summary>.*?</summary>', '', cleaned_text, flags=re.DOTALL)
|
| 76 |
return cleaned_text
|
| 77 |
|
| 78 |
def _serialize_msgs(self, msgs) -> list:
|
|
|
|
| 102 |
think_open = False
|
| 103 |
yielded_think_buffer = ""
|
| 104 |
current_raw_thinking = ""
|
|
|
|
| 105 |
|
| 106 |
async def yield_delta(content_type: str, text: str):
|
| 107 |
nonlocal think_open, yielded_think_buffer
|
|
|
|
| 141 |
|
| 142 |
payload_str = line[6:]
|
| 143 |
if payload_str == '[DONE]':
|
| 144 |
+
if think_open:
|
| 145 |
+
yield f"data: {json.dumps({'id': comp_id, 'object': 'chat.completion.chunk', 'created': int(time.time()), 'model': req.model, 'choices': [{'index': 0, 'delta': {'content': '</think>'}, 'finish_reason': None}]})}\n\n"
|
| 146 |
+
yield f"data: {json.dumps({'id': comp_id, 'object': 'chat.completion.chunk', 'created': int(time.time()), 'model': req.model, 'choices': [{'index': 0, 'delta': {}, 'finish_reason': 'stop'}]})}\n\n";
|
| 147 |
+
yield "data: [DONE]\n\n";
|
| 148 |
+
return
|
| 149 |
try:
|
| 150 |
dat = json.loads(payload_str).get("data", {})
|
| 151 |
+
except (json.JSONDecodeError, AttributeError):
|
| 152 |
+
continue
|
| 153 |
|
| 154 |
+
# --- START OF REFACTORED LOGIC ---
|
| 155 |
+
phase = dat.get("phase")
|
| 156 |
+
content_chunk = dat.get("delta_content") or dat.get("edit_content")
|
| 157 |
+
|
| 158 |
+
if not content_chunk:
|
|
|
|
| 159 |
continue
|
| 160 |
|
| 161 |
+
if phase == "thinking":
|
| 162 |
+
# Accumulate raw thinking content. `edit_content` replaces the buffer.
|
| 163 |
+
if dat.get("edit_content") is not None:
|
| 164 |
+
current_raw_thinking = content_chunk
|
| 165 |
+
else:
|
| 166 |
+
current_raw_thinking += content_chunk
|
| 167 |
+
# Yield the processed delta of the accumulated thinking content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
async for item in yield_delta("thinking", current_raw_thinking):
|
| 169 |
yield item
|
| 170 |
+
|
| 171 |
+
elif phase == "answer":
|
| 172 |
+
# Directly yield the answer chunk for processing
|
| 173 |
+
async for item in yield_delta("answer", content_chunk):
|
| 174 |
+
yield item
|
| 175 |
+
# --- END OF REFACTORED LOGIC ---
|
| 176 |
+
|
| 177 |
except Exception:
|
| 178 |
logger.exception("Stream error"); raise
|
| 179 |
|
|
|
|
| 202 |
dat = json.loads(payload_str).get("data", {})
|
| 203 |
except (json.JSONDecodeError, AttributeError): continue
|
| 204 |
|
| 205 |
+
# Use the more robust phase-based logic for non-stream as well
|
| 206 |
+
phase = dat.get("phase")
|
| 207 |
+
content_chunk = dat.get("delta_content") or dat.get("edit_content")
|
|
|
|
| 208 |
|
| 209 |
+
if not content_chunk:
|
| 210 |
+
continue
|
| 211 |
|
| 212 |
+
if phase == "thinking":
|
| 213 |
+
answer_started = False # Ensure we are in thinking mode
|
| 214 |
+
if dat.get("edit_content") is not None:
|
| 215 |
+
current_raw_thinking = content_chunk
|
| 216 |
+
else:
|
| 217 |
+
current_raw_thinking += content_chunk
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
last_thinking_content = current_raw_thinking
|
| 219 |
+
|
| 220 |
+
elif phase == "answer":
|
| 221 |
+
if not answer_started:
|
| 222 |
+
# First answer chunk might contain leftover thinking part, clean it.
|
| 223 |
+
cleaned_chunk = self._clean_answer_content(content_chunk)
|
| 224 |
+
if cleaned_chunk:
|
| 225 |
+
raw_answer_parts.append(cleaned_chunk)
|
| 226 |
+
answer_started = True
|
| 227 |
+
else:
|
| 228 |
+
raw_answer_parts.append(content_chunk)
|
| 229 |
+
else:
|
| 230 |
+
continue
|
| 231 |
break
|
| 232 |
|
| 233 |
full_answer = ''.join(raw_answer_parts)
|