Spaces:
Running
Running
Fix: Restore tag-based thinking parser, remove slow marker scanning
The UI commits (7d6f332, 0956db5) replaced the <think> tag parser with
a marker-based approach that scans full_response for strings like "---"
and "以下是總結" on every token. This caused O(n²) behavior and the
markers rarely match Qwen3's actual <think>...</think> output.
Restores parse_thinking_blocks() with:
- Both <think> and <thinking> tag support
- streaming=True for live partial thinking display
- Proper tag-based separation instead of heuristic markers
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
app.py
CHANGED
|
@@ -57,26 +57,45 @@ def load_model():
|
|
| 57 |
raise
|
| 58 |
|
| 59 |
|
| 60 |
-
def parse_thinking_blocks(content: str) -> Tuple[str, str]:
|
| 61 |
"""
|
| 62 |
Parse thinking blocks from model output.
|
| 63 |
-
|
|
|
|
| 64 |
Args:
|
| 65 |
content: Full model response
|
| 66 |
-
|
|
|
|
| 67 |
Returns:
|
| 68 |
Tuple of (thinking_content, summary_content)
|
| 69 |
"""
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
|
| 82 |
def summarize_streaming(file_obj, max_tokens: int = 2048, temperature: float = 0.6) -> Generator[Tuple[str, str], None, None]:
|
|
@@ -133,11 +152,7 @@ def summarize_streaming(file_obj, max_tokens: int = 2048, temperature: float = 0
|
|
| 133 |
full_response = ""
|
| 134 |
current_thinking = ""
|
| 135 |
current_summary = warning_msg
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
# Markers that indicate summary section has started
|
| 139 |
-
SUMMARY_MARKERS = ["---", "以下是總結", "總結:", "Summary:"]
|
| 140 |
-
|
| 141 |
try:
|
| 142 |
stream = llm.create_chat_completion(
|
| 143 |
messages=messages,
|
|
@@ -149,7 +164,7 @@ def summarize_streaming(file_obj, max_tokens: int = 2048, temperature: float = 0
|
|
| 149 |
stop=["<|end_of_text|>", "<|eot_id|>", "<|eom_id|>"],
|
| 150 |
stream=True
|
| 151 |
)
|
| 152 |
-
|
| 153 |
for chunk in stream:
|
| 154 |
if 'choices' in chunk and len(chunk['choices']) > 0:
|
| 155 |
delta = chunk['choices'][0].get('delta', {})
|
|
@@ -158,36 +173,25 @@ def summarize_streaming(file_obj, max_tokens: int = 2048, temperature: float = 0
|
|
| 158 |
# Convert to Traditional Chinese (Taiwan)
|
| 159 |
converted = converter.convert(content)
|
| 160 |
full_response += converted
|
| 161 |
-
|
| 162 |
-
#
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
# Everything from marker onward is summary
|
| 172 |
-
current_summary = warning_msg + full_response[marker_pos:]
|
| 173 |
-
break
|
| 174 |
-
|
| 175 |
-
if not summary_started:
|
| 176 |
-
# Still in thinking phase
|
| 177 |
-
current_thinking += converted
|
| 178 |
-
else:
|
| 179 |
-
# Already in summary phase, add to summary
|
| 180 |
-
current_summary += converted
|
| 181 |
-
|
| 182 |
# Yield both fields on every token
|
| 183 |
yield (current_thinking, current_summary)
|
| 184 |
-
|
| 185 |
-
#
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
# Reset model state
|
| 192 |
llm.reset()
|
| 193 |
|
|
|
|
| 57 |
raise
|
| 58 |
|
| 59 |
|
| 60 |
+
def parse_thinking_blocks(content: str, streaming: bool = False) -> Tuple[str, str]:
    """
    Parse thinking blocks from model output.

    Supports both <think> and <thinking> tags.

    Args:
        content: Full model response accumulated so far.
        streaming: If True, handle an unclosed <think> tag so partial
            thinking can be displayed live while the model is still
            generating thinking tokens.

    Returns:
        Tuple of (thinking_content, summary_content).
    """
    closed_pattern = r'<think(?:ing)?>(.*?)</think(?:ing)?>'
    # Matches a trailing, unclosed tag. Applied to `remaining` (after all
    # closed blocks are stripped), so any tag left is necessarily unclosed;
    # DOTALL `.*$` lets the partial thinking text contain '<' characters,
    # which the narrower [^<]* form would fail to match.
    open_pattern = r'<think(?:ing)?>(.*)$'

    # Extract completed thinking blocks.
    closed_matches = re.findall(closed_pattern, content, re.DOTALL)
    # Remove completed blocks; what is left is candidate summary text.
    remaining = re.sub(closed_pattern, '', content, flags=re.DOTALL).strip()

    thinking_parts = [m.strip() for m in closed_matches if m.strip()]

    if streaming:
        # Check for an unclosed <think> tag (model still mid-thought).
        open_match = re.search(open_pattern, remaining, re.DOTALL)
        if open_match:
            partial = open_match.group(1).strip()
            if partial:
                thinking_parts.append(partial)
            # Nothing after the open tag counts as summary yet.
            remaining = re.sub(open_pattern, '', remaining,
                               flags=re.DOTALL).strip()

    thinking = '\n\n'.join(thinking_parts)

    if not thinking and not closed_matches:
        # No thinking tags found at all. If the content begins with an
        # (unclosed, empty-so-far) tag, there is no summary text either.
        return ("", content if not content.startswith('<think') else "")

    return (thinking, remaining)
|
| 99 |
|
| 100 |
|
| 101 |
def summarize_streaming(file_obj, max_tokens: int = 2048, temperature: float = 0.6) -> Generator[Tuple[str, str], None, None]:
|
|
|
|
| 152 |
full_response = ""
|
| 153 |
current_thinking = ""
|
| 154 |
current_summary = warning_msg
|
| 155 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
try:
|
| 157 |
stream = llm.create_chat_completion(
|
| 158 |
messages=messages,
|
|
|
|
| 164 |
stop=["<|end_of_text|>", "<|eot_id|>", "<|eom_id|>"],
|
| 165 |
stream=True
|
| 166 |
)
|
| 167 |
+
|
| 168 |
for chunk in stream:
|
| 169 |
if 'choices' in chunk and len(chunk['choices']) > 0:
|
| 170 |
delta = chunk['choices'][0].get('delta', {})
|
|
|
|
| 173 |
# Convert to Traditional Chinese (Taiwan)
|
| 174 |
converted = converter.convert(content)
|
| 175 |
full_response += converted
|
| 176 |
+
|
| 177 |
+
# Parse thinking blocks and summary (streaming=True for partial tags)
|
| 178 |
+
thinking_blocks, summary = parse_thinking_blocks(full_response, streaming=True)
|
| 179 |
+
|
| 180 |
+
# Update thinking field (show thinking blocks live)
|
| 181 |
+
current_thinking = thinking_blocks if thinking_blocks else ""
|
| 182 |
+
|
| 183 |
+
# Update summary field
|
| 184 |
+
current_summary = warning_msg + summary if summary else warning_msg
|
| 185 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
# Yield both fields on every token
|
| 187 |
yield (current_thinking, current_summary)
|
| 188 |
+
|
| 189 |
+
# Final parse to ensure consistency
|
| 190 |
+
final_thinking, final_summary = parse_thinking_blocks(full_response)
|
| 191 |
+
current_thinking = final_thinking if final_thinking else ""
|
| 192 |
+
current_summary = warning_msg + final_summary if final_summary else warning_msg
|
| 193 |
+
yield (current_thinking, current_summary)
|
| 194 |
+
|
| 195 |
# Reset model state
|
| 196 |
llm.reset()
|
| 197 |
|