Luigi Claude Opus 4.5 committed on
Commit
5f1c65f
·
1 Parent(s): 8f0478f

Fix: Restore tag-based thinking parser, remove slow marker scanning

Browse files

The UI commits (7d6f332, 0956db5) replaced the <think> tag parser with
a marker-based approach that scans full_response for strings like "---"
and "以下是總結" on every token. This caused O(n²) behavior and the
markers rarely matched Qwen3's actual <think>...</think> output.

Restores parse_thinking_blocks() with:
- Both <think> and <thinking> tag support
- streaming=True for live partial thinking display
- Proper tag-based separation instead of heuristic markers

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +51 -47
app.py CHANGED
@@ -57,26 +57,45 @@ def load_model():
57
  raise
58
 
59
 
60
- def parse_thinking_blocks(content: str) -> Tuple[str, str]:
61
  """
62
  Parse thinking blocks from model output.
63
-
 
64
  Args:
65
  content: Full model response
66
-
 
67
  Returns:
68
  Tuple of (thinking_content, summary_content)
69
  """
70
- pattern = r'<thinking>(.*?)</thinking>'
71
- matches = re.findall(pattern, content, re.DOTALL)
72
-
73
- if not matches:
74
- return ("", content)
75
-
76
- thinking = '\n\n'.join(match.strip() for match in matches)
77
- summary = re.sub(pattern, '', content, flags=re.DOTALL).strip()
78
-
79
- return (thinking, summary)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
 
82
  def summarize_streaming(file_obj, max_tokens: int = 2048, temperature: float = 0.6) -> Generator[Tuple[str, str], None, None]:
@@ -133,11 +152,7 @@ def summarize_streaming(file_obj, max_tokens: int = 2048, temperature: float = 0
133
  full_response = ""
134
  current_thinking = ""
135
  current_summary = warning_msg
136
- summary_started = False
137
-
138
- # Markers that indicate summary section has started
139
- SUMMARY_MARKERS = ["---", "以下是總結", "總結:", "Summary:"]
140
-
141
  try:
142
  stream = llm.create_chat_completion(
143
  messages=messages,
@@ -149,7 +164,7 @@ def summarize_streaming(file_obj, max_tokens: int = 2048, temperature: float = 0
149
  stop=["<|end_of_text|>", "<|eot_id|>", "<|eom_id|>"],
150
  stream=True
151
  )
152
-
153
  for chunk in stream:
154
  if 'choices' in chunk and len(chunk['choices']) > 0:
155
  delta = chunk['choices'][0].get('delta', {})
@@ -158,36 +173,25 @@ def summarize_streaming(file_obj, max_tokens: int = 2048, temperature: float = 0
158
  # Convert to Traditional Chinese (Taiwan)
159
  converted = converter.convert(content)
160
  full_response += converted
161
-
162
- # Check if we've hit a summary marker
163
- if not summary_started:
164
- for marker in SUMMARY_MARKERS:
165
- if marker in full_response:
166
- summary_started = True
167
- # Find where summary starts
168
- marker_pos = full_response.find(marker)
169
- # Everything before marker is thinking
170
- current_thinking = full_response[:marker_pos]
171
- # Everything from marker onward is summary
172
- current_summary = warning_msg + full_response[marker_pos:]
173
- break
174
-
175
- if not summary_started:
176
- # Still in thinking phase
177
- current_thinking += converted
178
- else:
179
- # Already in summary phase, add to summary
180
- current_summary += converted
181
-
182
  # Yield both fields on every token
183
  yield (current_thinking, current_summary)
184
-
185
- # If summary never started, put everything in summary field
186
- if not summary_started and current_thinking:
187
- current_summary = warning_msg + current_thinking
188
- current_thinking = "(Model did not separate thinking from summary)"
189
- yield (current_thinking, current_summary)
190
-
191
  # Reset model state
192
  llm.reset()
193
 
 
57
  raise
58
 
59
 
60
def parse_thinking_blocks(content: str, streaming: bool = False) -> Tuple[str, str]:
    """
    Parse thinking blocks from model output.

    Supports both <think> and <thinking> tags. A trailing unclosed tag
    (the model is still emitting thinking tokens) is treated as partial
    thinking in every mode, so no text is ever silently dropped.

    Args:
        content: Full model response accumulated so far.
        streaming: Retained for backward compatibility. Unclosed thinking
            blocks are now handled whether or not this is set, because the
            final (non-streaming) parse can also see an unterminated tag
            when generation stops mid-thought.

    Returns:
        Tuple of (thinking_content, summary_content)
    """
    closed_pattern = r'<think(?:ing)?>(.*?)</think(?:ing)?>'
    # Tempered pattern: an opening tag followed by any text that never
    # contains a closing tag, running to end-of-string. Unlike '[^<]*$',
    # this still matches when the partial thinking itself contains '<'
    # (e.g. "x < y"), which would otherwise drop live thinking text.
    open_pattern = r'<think(?:ing)?>((?:(?!</think(?:ing)?>).)*)$'

    # Completed thinking blocks.
    closed_matches = re.findall(closed_pattern, content, re.DOTALL)
    # Everything outside completed blocks is candidate summary text.
    remaining = re.sub(closed_pattern, '', content, flags=re.DOTALL).strip()

    thinking_parts = [m.strip() for m in closed_matches if m.strip()]

    # A trailing unclosed tag means the model is mid-thought: surface its
    # content as thinking and keep it out of the summary. Previously this
    # was gated on streaming=True, so a final parse of output ending in an
    # unclosed <think> block returned ("", "") and lost the whole response.
    open_match = re.search(open_pattern, content, re.DOTALL)
    if open_match:
        partial = open_match.group(1).strip()
        if partial:
            thinking_parts.append(partial)
        remaining = re.sub(open_pattern, '', remaining, flags=re.DOTALL).strip()

    thinking = '\n\n'.join(thinking_parts)
    return (thinking, remaining)
99
 
100
 
101
  def summarize_streaming(file_obj, max_tokens: int = 2048, temperature: float = 0.6) -> Generator[Tuple[str, str], None, None]:
 
152
  full_response = ""
153
  current_thinking = ""
154
  current_summary = warning_msg
155
+
 
 
 
 
156
  try:
157
  stream = llm.create_chat_completion(
158
  messages=messages,
 
164
  stop=["<|end_of_text|>", "<|eot_id|>", "<|eom_id|>"],
165
  stream=True
166
  )
167
+
168
  for chunk in stream:
169
  if 'choices' in chunk and len(chunk['choices']) > 0:
170
  delta = chunk['choices'][0].get('delta', {})
 
173
  # Convert to Traditional Chinese (Taiwan)
174
  converted = converter.convert(content)
175
  full_response += converted
176
+
177
+ # Parse thinking blocks and summary (streaming=True for partial tags)
178
+ thinking_blocks, summary = parse_thinking_blocks(full_response, streaming=True)
179
+
180
+ # Update thinking field (show thinking blocks live)
181
+ current_thinking = thinking_blocks if thinking_blocks else ""
182
+
183
+ # Update summary field
184
+ current_summary = warning_msg + summary if summary else warning_msg
185
+
 
 
 
 
 
 
 
 
 
 
 
186
  # Yield both fields on every token
187
  yield (current_thinking, current_summary)
188
+
189
+ # Final parse to ensure consistency
190
+ final_thinking, final_summary = parse_thinking_blocks(full_response)
191
+ current_thinking = final_thinking if final_thinking else ""
192
+ current_summary = warning_msg + final_summary if final_summary else warning_msg
193
+ yield (current_thinking, current_summary)
194
+
195
  # Reset model state
196
  llm.reset()
197