Spaces:
Running
Running
ming
committed on
Commit
·
85dcd04
1
Parent(s):
cfe8d29
Fix JSON parsing errors in V4 NDJSON stream
Browse files
Issue:
- Model generates invalid JSON with unescaped quotes (e.g., "TVNZ's")
- JSON parsing fails with 'Expecting : delimiter' error
- Code tried to use undefined 'patch' variable after parse failure
Fixes:
- Initialize patch = None before try block
- Added JSON repair logic with two strategies:
1. Extract first complete JSON object from incomplete lines
2. Attempt to escape unescaped quotes in value fields
- Improved error logging to show more context (150 chars)
- Added CRITICAL JSON FORMATTING RULES to prompt:
- Emphasize proper quote escaping (\" for quotes in strings)
- Remind model to test JSON validity
- Warn against unescaped quotes in values
This handles malformed JSON gracefully while improving model output quality.
app/services/structured_summarizer.py
CHANGED
|
@@ -262,7 +262,15 @@ Rules:
|
|
| 262 |
* Main summary MUST be 2 sentences maximum.
|
| 263 |
* Each key point MUST be 8-12 words maximum.
|
| 264 |
* Category MUST be 1-2 words only.
|
| 265 |
-
* NO verbose explanations. NO long descriptions. BE BRIEF!
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
|
| 267 |
def _build_style_instruction(self, style: str) -> str:
|
| 268 |
"""Build the style-specific instruction."""
|
|
@@ -612,6 +620,7 @@ Rules:
|
|
| 612 |
continue
|
| 613 |
|
| 614 |
# Try to parse JSON patch
|
|
|
|
| 615 |
try:
|
| 616 |
patch = json.loads(line)
|
| 617 |
|
|
@@ -626,9 +635,65 @@ Rules:
|
|
| 626 |
|
| 627 |
except json.JSONDecodeError as e:
|
| 628 |
logger.warning(
|
| 629 |
-
f"Failed to parse NDJSON line: {line[:
|
| 630 |
)
|
| 631 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 632 |
|
| 633 |
# Apply patch to state
|
| 634 |
is_done = self._apply_patch(state, patch)
|
|
|
|
| 262 |
* Main summary MUST be 2 sentences maximum.
|
| 263 |
* Each key point MUST be 8-12 words maximum.
|
| 264 |
* Category MUST be 1-2 words only.
|
| 265 |
+
* NO verbose explanations. NO long descriptions. BE BRIEF!
|
| 266 |
+
|
| 267 |
+
- CRITICAL JSON FORMATTING RULES:
|
| 268 |
+
* ALL string values MUST have quotes properly escaped.
|
| 269 |
+
* If a value contains a quote character, escape it as \\"
|
| 270 |
+
* Example: "value": "TVNZ\\'s legacy" (escape the apostrophe/quote)
|
| 271 |
+
* NEVER output unescaped quotes inside JSON string values.
|
| 272 |
+
* Each JSON object MUST be on a single line and be valid JSON.
|
| 273 |
+
* Test your JSON - it must parse correctly!"""
|
| 274 |
|
| 275 |
def _build_style_instruction(self, style: str) -> str:
|
| 276 |
"""Build the style-specific instruction."""
|
|
|
|
| 620 |
continue
|
| 621 |
|
| 622 |
# Try to parse JSON patch
|
| 623 |
+
patch = None
|
| 624 |
try:
|
| 625 |
patch = json.loads(line)
|
| 626 |
|
|
|
|
| 635 |
|
| 636 |
except json.JSONDecodeError as e:
|
| 637 |
logger.warning(
|
| 638 |
+
f"Failed to parse NDJSON line: {line[:150]}... Error: {e}"
|
| 639 |
)
|
| 640 |
+
# Try to extract valid JSON from the line
|
| 641 |
+
# Common issues: incomplete lines, unescaped quotes, extra text
|
| 642 |
+
try:
|
| 643 |
+
# Strategy 1: Try to find the first complete JSON object
|
| 644 |
+
brace_count = 0
|
| 645 |
+
end_pos = -1
|
| 646 |
+
for i, char in enumerate(line):
|
| 647 |
+
if char == '{':
|
| 648 |
+
brace_count += 1
|
| 649 |
+
elif char == '}':
|
| 650 |
+
brace_count -= 1
|
| 651 |
+
if brace_count == 0:
|
| 652 |
+
end_pos = i + 1
|
| 653 |
+
break
|
| 654 |
+
|
| 655 |
+
if end_pos > 0:
|
| 656 |
+
# Found a complete JSON object, try parsing just that part
|
| 657 |
+
try:
|
| 658 |
+
patch = json.loads(line[:end_pos])
|
| 659 |
+
logger.info(f"✅ Extracted valid JSON from incomplete line")
|
| 660 |
+
except:
|
| 661 |
+
pass
|
| 662 |
+
|
| 663 |
+
# Strategy 2: If still failed, try to fix common quote issues
|
| 664 |
+
if patch is None and '"value":"' in line:
|
| 665 |
+
# Try to escape unescaped quotes in the value field
|
| 666 |
+
import re
|
| 667 |
+
# Simple heuristic: if we see a pattern like "value":"...text with 'quote'..."
|
| 668 |
+
# try to escape the inner quotes
|
| 669 |
+
def try_fix_quotes(text):
    """Try to repair a patch line whose "value" string has unescaped quotes.

    The line is expected to look like
    ``{"op":"...","field":"...","value":"<text>"}``. Everything between the
    opening quote of the value and the final ``"}`` on the line is treated
    as the value text, and every double quote in it that is not already
    backslash-escaped gets escaped before the line is re-parsed.

    Args:
        text: One raw NDJSON line that failed ``json.loads``.

    Returns:
        The parsed object if the repaired line is valid JSON, otherwise
        ``None`` (shape not recognised, or still invalid after escaping).
    """
    import json
    import re

    # The value group is greedy and the closing `"}` is anchored at the end
    # of the line. A lazy value group followed by a catch-all group would
    # always capture an empty value and make the repair a no-op.
    match = re.match(
        r'(\{"op":"[^"]+","field":"[^"]+","value":")(.*)("\}\s*)$',
        text,
        re.DOTALL,
    )
    if not match:
        return None

    prefix, value_content, suffix = match.groups()

    # One pass: keep existing escape pairs (backslash + any char) untouched,
    # and escape every remaining bare double quote. This avoids a placeholder
    # token that could collide with real content.
    escaped_value = re.sub(
        r'(\\.)|"',
        lambda m: m.group(1) or '\\"',
        value_content,
    )

    try:
        return json.loads(prefix + escaped_value + suffix)
    except json.JSONDecodeError:
        # Still not valid JSON (e.g. an invalid escape sequence); give up.
        return None
|
| 687 |
+
|
| 688 |
+
repaired = try_fix_quotes(line)
|
| 689 |
+
if repaired:
|
| 690 |
+
patch = repaired
|
| 691 |
+
logger.info(f"✅ Repaired JSON by escaping quotes")
|
| 692 |
+
except Exception as repair_error:
|
| 693 |
+
logger.debug(f"JSON repair attempt failed: {repair_error}")
|
| 694 |
+
|
| 695 |
+
if patch is None:
|
| 696 |
+
continue
|
| 697 |
|
| 698 |
# Apply patch to state
|
| 699 |
is_done = self._apply_patch(state, patch)
|