Luigi Claude Opus 4.5 committed on
Commit
5f1c65f
·
1 Parent(s): 8f0478f

Fix: Restore tag-based thinking parser, remove slow marker scanning

Browse files

The UI commits (7d6f332, 0956db5) replaced the <think> tag parser with
a marker-based approach that scans full_response for strings like "---"
and "以下是總結" on every token. This caused O(n²) behavior and the
markers rarely matched Qwen3's actual <think>...</think> output.

Restores parse_thinking_blocks() with:
- Both <think> and <thinking> tag support
- streaming=True for live partial thinking display
- Proper tag-based separation instead of heuristic markers

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +51 -47
app.py CHANGED
@@ -57,26 +57,45 @@ def load_model():
57
  raise
58
 
59
 
60
- def parse_thinking_blocks(content: str) -> Tuple[str, str]:
61
  """
62
  Parse thinking blocks from model output.
63
-
 
64
  Args:
65
  content: Full model response
66
-
 
67
  Returns:
68
  Tuple of (thinking_content, summary_content)
69
  """
70
- pattern = r'<thinking>(.*?)</thinking>'
71
- matches = re.findall(pattern, content, re.DOTALL)
72
-
73
- if not matches:
74
- return ("", content)
75
-
76
- thinking = '\n\n'.join(match.strip() for match in matches)
77
- summary = re.sub(pattern, '', content, flags=re.DOTALL).strip()
78
-
79
- return (thinking, summary)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
 
82
  def summarize_streaming(file_obj, max_tokens: int = 2048, temperature: float = 0.6) -> Generator[Tuple[str, str], None, None]:
@@ -133,11 +152,7 @@ def summarize_streaming(file_obj, max_tokens: int = 2048, temperature: float = 0
133
  full_response = ""
134
  current_thinking = ""
135
  current_summary = warning_msg
136
- summary_started = False
137
-
138
- # Markers that indicate summary section has started
139
- SUMMARY_MARKERS = ["---", "以下是總結", "總結:", "Summary:"]
140
-
141
  try:
142
  stream = llm.create_chat_completion(
143
  messages=messages,
@@ -149,7 +164,7 @@ def summarize_streaming(file_obj, max_tokens: int = 2048, temperature: float = 0
149
  stop=["<|end_of_text|>", "<|eot_id|>", "<|eom_id|>"],
150
  stream=True
151
  )
152
-
153
  for chunk in stream:
154
  if 'choices' in chunk and len(chunk['choices']) > 0:
155
  delta = chunk['choices'][0].get('delta', {})
@@ -158,36 +173,25 @@ def summarize_streaming(file_obj, max_tokens: int = 2048, temperature: float = 0
158
  # Convert to Traditional Chinese (Taiwan)
159
  converted = converter.convert(content)
160
  full_response += converted
161
-
162
- # Check if we've hit a summary marker
163
- if not summary_started:
164
- for marker in SUMMARY_MARKERS:
165
- if marker in full_response:
166
- summary_started = True
167
- # Find where summary starts
168
- marker_pos = full_response.find(marker)
169
- # Everything before marker is thinking
170
- current_thinking = full_response[:marker_pos]
171
- # Everything from marker onward is summary
172
- current_summary = warning_msg + full_response[marker_pos:]
173
- break
174
-
175
- if not summary_started:
176
- # Still in thinking phase
177
- current_thinking += converted
178
- else:
179
- # Already in summary phase, add to summary
180
- current_summary += converted
181
-
182
  # Yield both fields on every token
183
  yield (current_thinking, current_summary)
184
-
185
- # If summary never started, put everything in summary field
186
- if not summary_started and current_thinking:
187
- current_summary = warning_msg + current_thinking
188
- current_thinking = "(Model did not separate thinking from summary)"
189
- yield (current_thinking, current_summary)
190
-
191
  # Reset model state
192
  llm.reset()
193
 
 
57
  raise
58
 
59
 
60
def parse_thinking_blocks(content: str, streaming: bool = False) -> Tuple[str, str]:
    """
    Parse thinking blocks from model output.

    Supports both <think> and <thinking> tags. A trailing unclosed tag
    (the model is still emitting thinking tokens) is treated as partial
    thinking in every mode, so no text is ever silently dropped.

    Args:
        content: Full model response accumulated so far.
        streaming: Retained for backward compatibility. Unclosed thinking
            blocks are now handled whether or not this is set, because the
            final (non-streaming) parse can also see an unterminated tag
            when generation stops mid-thought.

    Returns:
        Tuple of (thinking_content, summary_content)
    """
    closed_pattern = r'<think(?:ing)?>(.*?)</think(?:ing)?>'
    # Tempered pattern: an opening tag followed by any text that never
    # contains a closing tag, running to end-of-string. Unlike '[^<]*$',
    # this still matches when the partial thinking itself contains '<'
    # (e.g. "x < y"), which would otherwise drop live thinking text.
    open_pattern = r'<think(?:ing)?>((?:(?!</think(?:ing)?>).)*)$'

    # Completed thinking blocks.
    closed_matches = re.findall(closed_pattern, content, re.DOTALL)
    # Everything outside completed blocks is candidate summary text.
    remaining = re.sub(closed_pattern, '', content, flags=re.DOTALL).strip()

    thinking_parts = [m.strip() for m in closed_matches if m.strip()]

    # A trailing unclosed tag means the model is mid-thought: surface its
    # content as thinking and keep it out of the summary. Previously this
    # was gated on streaming=True, so a final parse of output ending in an
    # unclosed <think> block returned ("", "") and lost the whole response.
    open_match = re.search(open_pattern, content, re.DOTALL)
    if open_match:
        partial = open_match.group(1).strip()
        if partial:
            thinking_parts.append(partial)
        remaining = re.sub(open_pattern, '', remaining, flags=re.DOTALL).strip()

    thinking = '\n\n'.join(thinking_parts)
    return (thinking, remaining)
99
 
100
 
101
  def summarize_streaming(file_obj, max_tokens: int = 2048, temperature: float = 0.6) -> Generator[Tuple[str, str], None, None]:
 
152
  full_response = ""
153
  current_thinking = ""
154
  current_summary = warning_msg
155
+
 
 
 
 
156
  try:
157
  stream = llm.create_chat_completion(
158
  messages=messages,
 
164
  stop=["<|end_of_text|>", "<|eot_id|>", "<|eom_id|>"],
165
  stream=True
166
  )
167
+
168
  for chunk in stream:
169
  if 'choices' in chunk and len(chunk['choices']) > 0:
170
  delta = chunk['choices'][0].get('delta', {})
 
173
  # Convert to Traditional Chinese (Taiwan)
174
  converted = converter.convert(content)
175
  full_response += converted
176
+
177
+ # Parse thinking blocks and summary (streaming=True for partial tags)
178
+ thinking_blocks, summary = parse_thinking_blocks(full_response, streaming=True)
179
+
180
+ # Update thinking field (show thinking blocks live)
181
+ current_thinking = thinking_blocks if thinking_blocks else ""
182
+
183
+ # Update summary field
184
+ current_summary = warning_msg + summary if summary else warning_msg
185
+
 
 
 
 
 
 
 
 
 
 
 
186
  # Yield both fields on every token
187
  yield (current_thinking, current_summary)
188
+
189
+ # Final parse to ensure consistency
190
+ final_thinking, final_summary = parse_thinking_blocks(full_response)
191
+ current_thinking = final_thinking if final_thinking else ""
192
+ current_summary = warning_msg + final_summary if final_summary else warning_msg
193
+ yield (current_thinking, current_summary)
194
+
195
  # Reset model state
196
  llm.reset()
197