ming committed on
Commit
85dcd04
·
1 Parent(s): cfe8d29

Fix JSON parsing errors in V4 NDJSON stream

Browse files

Issue:
- Model generates invalid JSON with unescaped quotes (e.g., "TVNZ's")
- JSON parsing fails with 'Expecting : delimiter' error
- Code tried to use undefined 'patch' variable after parse failure

Fixes:
- Initialize patch = None before the try block
- Add JSON repair logic with two strategies:
  1. Extract the first complete JSON object from incomplete lines
  2. Attempt to escape unescaped quotes in value fields
- Improve error logging to show more context (150 chars)
- Add CRITICAL JSON FORMATTING RULES to the prompt:
  - Emphasize proper quote escaping (\" for quotes in strings)
  - Remind the model to test JSON validity
  - Warn against unescaped quotes in values

This handles malformed JSON gracefully while improving model output quality.

app/services/structured_summarizer.py CHANGED
@@ -262,7 +262,15 @@ Rules:
262
  * Main summary MUST be 2 sentences maximum.
263
  * Each key point MUST be 8-12 words maximum.
264
  * Category MUST be 1-2 words only.
265
- * NO verbose explanations. NO long descriptions. BE BRIEF!"""
 
 
 
 
 
 
 
 
266
 
267
  def _build_style_instruction(self, style: str) -> str:
268
  """Build the style-specific instruction."""
@@ -612,6 +620,7 @@ Rules:
612
  continue
613
 
614
  # Try to parse JSON patch
 
615
  try:
616
  patch = json.loads(line)
617
 
@@ -626,9 +635,65 @@ Rules:
626
 
627
  except json.JSONDecodeError as e:
628
  logger.warning(
629
- f"Failed to parse NDJSON line: {line[:100]}... Error: {e}"
630
  )
631
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
632
 
633
  # Apply patch to state
634
  is_done = self._apply_patch(state, patch)
 
262
  * Main summary MUST be 2 sentences maximum.
263
  * Each key point MUST be 8-12 words maximum.
264
  * Category MUST be 1-2 words only.
265
+ * NO verbose explanations. NO long descriptions. BE BRIEF!
266
+
267
+ - CRITICAL JSON FORMATTING RULES:
268
+ * ALL string values MUST have quotes properly escaped.
269
+ * If a value contains a quote character, escape it as \\"
270
+ * Example: "value": "TVNZ\\'s legacy" (escape the apostrophe/quote)
271
+ * NEVER output unescaped quotes inside JSON string values.
272
+ * Each JSON object MUST be on a single line and be valid JSON.
273
+ * Test your JSON - it must parse correctly!"""
274
 
275
  def _build_style_instruction(self, style: str) -> str:
276
  """Build the style-specific instruction."""
 
620
  continue
621
 
622
  # Try to parse JSON patch
623
+ patch = None
624
  try:
625
  patch = json.loads(line)
626
 
 
635
 
636
  except json.JSONDecodeError as e:
637
  logger.warning(
638
+ f"Failed to parse NDJSON line: {line[:150]}... Error: {e}"
639
  )
640
+ # Try to extract valid JSON from the line
641
+ # Common issues: incomplete lines, unescaped quotes, extra text
642
+ try:
643
+ # Strategy 1: Try to find the first complete JSON object
644
+ brace_count = 0
645
+ end_pos = -1
646
+ for i, char in enumerate(line):
647
+ if char == '{':
648
+ brace_count += 1
649
+ elif char == '}':
650
+ brace_count -= 1
651
+ if brace_count == 0:
652
+ end_pos = i + 1
653
+ break
654
+
655
+ if end_pos > 0:
656
+ # Found a complete JSON object, try parsing just that part
657
+ try:
658
+ patch = json.loads(line[:end_pos])
659
+ logger.info(f"✅ Extracted valid JSON from incomplete line")
660
+ except:
661
+ pass
662
+
663
+ # Strategy 2: If still failed, try to fix common quote issues
664
+ if patch is None and '"value":"' in line:
665
+ # Try to escape unescaped quotes in the value field
666
+ import re
667
+ # Simple heuristic: if we see a pattern like "value":"...text with 'quote'..."
668
+ # try to escape the inner quotes
669
+ def try_fix_quotes(text):
670
+ # Try to find and close the value string properly
671
+ match = re.match(r'(\{"op":"[^"]+","field":"[^"]+","value":")(.*?)(.*)$', text)
672
+ if match:
673
+ prefix = match.group(1)
674
+ value_content = match.group(2)
675
+ rest = match.group(3)
676
+ # Escape any unescaped quotes in the value
677
+ value_content = value_content.replace('\\"', '__TEMP__')
678
+ value_content = value_content.replace('"', '\\"')
679
+ value_content = value_content.replace('__TEMP__', '\\"')
680
+ # Try to reconstruct: prefix + escaped_value + "}"
681
+ if rest.startswith('"}'):
682
+ try:
683
+ return json.loads(prefix + value_content + rest)
684
+ except:
685
+ pass
686
+ return None
687
+
688
+ repaired = try_fix_quotes(line)
689
+ if repaired:
690
+ patch = repaired
691
+ logger.info(f"✅ Repaired JSON by escaping quotes")
692
+ except Exception as repair_error:
693
+ logger.debug(f"JSON repair attempt failed: {repair_error}")
694
+
695
+ if patch is None:
696
+ continue
697
 
698
  # Apply patch to state
699
  is_done = self._apply_patch(state, patch)