Patryk Studzinski committed on
Commit
6cc98f9
·
1 Parent(s): 093fabc

Fix: Handle double-escaped JSON in infill parser + add debug logging

Browse files
Files changed (2) hide show
  1. app/logic/infill_utils.py +61 -9
  2. app/main.py +22 -0
app/logic/infill_utils.py CHANGED
@@ -89,7 +89,7 @@ def detect_gaps(text: str, notation: str = "auto") -> List[GapInfo]:
89
  return gaps
90
 
91
 
92
- def parse_infill_json(raw_output: str) -> Optional[dict]:
93
  """
94
  Extract and parse JSON from LLM output.
95
 
@@ -97,10 +97,12 @@ def parse_infill_json(raw_output: str) -> Optional[dict]:
97
  - JSON wrapped in markdown code blocks
98
  - Leading/trailing text before/after JSON
99
  - Function-call style wrapper ({"name": "...", "arguments": {...}})
 
100
  - Minor formatting issues
101
 
102
  Args:
103
  raw_output: Raw text from LLM
 
104
 
105
  Returns:
106
  Parsed dict with 'filled_text' and 'gaps' keys, or None if parsing fails
@@ -114,18 +116,28 @@ def parse_infill_json(raw_output: str) -> Optional[dict]:
114
  }
115
  """
116
  if not raw_output:
 
 
117
  return None
118
 
 
 
 
 
119
  # Try to extract JSON from markdown code blocks
120
  json_block_pattern = r'```(?:json)?\s*([\s\S]*?)\s*```'
121
  match = re.search(json_block_pattern, raw_output)
122
  if match:
123
  raw_output = match.group(1)
 
 
124
 
125
  # Try to find JSON object boundaries
126
  # Look for the outermost { }
127
  start_idx = raw_output.find('{')
128
  if start_idx == -1:
 
 
129
  return None
130
 
131
  # Find matching closing brace
@@ -141,28 +153,68 @@ def parse_infill_json(raw_output: str) -> Optional[dict]:
141
  break
142
 
143
  if end_idx == -1:
 
 
144
  return None
145
 
146
  json_str = raw_output[start_idx:end_idx]
147
 
 
 
 
148
  try:
149
  parsed = json.loads(json_str)
150
 
151
- # Handle function-call style wrapper:
152
- # {"name": "filled_text", "arguments": {"filled_text": "...", "gaps": [...]}}
153
- if 'arguments' in parsed and isinstance(parsed['arguments'], dict):
154
- parsed = parsed['arguments']
155
 
156
- # Also handle: {"name": "...", "parameters": {...}}
157
- if 'parameters' in parsed and isinstance(parsed['parameters'], dict):
158
- parsed = parsed['parameters']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
  # Validate required fields
161
  if 'filled_text' not in parsed and 'gaps' not in parsed:
 
 
162
  return None
 
 
 
163
 
164
  return parsed
165
- except json.JSONDecodeError:
 
 
166
  return None
167
 
168
 
 
89
  return gaps
90
 
91
 
92
+ def parse_infill_json(raw_output: str, debug: bool = True) -> Optional[dict]:
93
  """
94
  Extract and parse JSON from LLM output.
95
 
 
97
  - JSON wrapped in markdown code blocks
98
  - Leading/trailing text before/after JSON
99
  - Function-call style wrapper ({"name": "...", "arguments": {...}})
100
+ - Double-escaped JSON strings in arguments field
101
  - Minor formatting issues
102
 
103
  Args:
104
  raw_output: Raw text from LLM
105
+ debug: If True, print debug info to logs
106
 
107
  Returns:
108
  Parsed dict with 'filled_text' and 'gaps' keys, or None if parsing fails
 
116
  }
117
  """
118
  if not raw_output:
119
+ if debug:
120
+ print("[INFILL_PARSER] Empty raw_output received")
121
  return None
122
 
123
+ if debug:
124
+ print(f"[INFILL_PARSER] Raw output length: {len(raw_output)}")
125
+ print(f"[INFILL_PARSER] Raw output preview: {raw_output[:500]}...")
126
+
127
  # Try to extract JSON from markdown code blocks
128
  json_block_pattern = r'```(?:json)?\s*([\s\S]*?)\s*```'
129
  match = re.search(json_block_pattern, raw_output)
130
  if match:
131
  raw_output = match.group(1)
132
+ if debug:
133
+ print("[INFILL_PARSER] Extracted from markdown code block")
134
 
135
  # Try to find JSON object boundaries
136
  # Look for the outermost { }
137
  start_idx = raw_output.find('{')
138
  if start_idx == -1:
139
+ if debug:
140
+ print("[INFILL_PARSER] No JSON object found (no opening brace)")
141
  return None
142
 
143
  # Find matching closing brace
 
153
  break
154
 
155
  if end_idx == -1:
156
+ if debug:
157
+ print("[INFILL_PARSER] No matching closing brace found")
158
  return None
159
 
160
  json_str = raw_output[start_idx:end_idx]
161
 
162
+ if debug:
163
+ print(f"[INFILL_PARSER] Extracted JSON string: {json_str[:300]}...")
164
+
165
  try:
166
  parsed = json.loads(json_str)
167
 
168
+ if debug:
169
+ print(f"[INFILL_PARSER] Parsed keys: {list(parsed.keys())}")
 
 
170
 
171
+ # Handle function-call style wrapper with STRING arguments (double-escaped):
172
+ # {"name": "fill_in_text", "arguments": "{\"filled_text\": \"...\"}"}
173
+ if 'arguments' in parsed:
174
+ args = parsed['arguments']
175
+ if isinstance(args, str):
176
+ # Arguments is a JSON string - parse it
177
+ if debug:
178
+ print(f"[INFILL_PARSER] Arguments is a string, parsing inner JSON...")
179
+ print(f"[INFILL_PARSER] Arguments string preview: {args[:200]}...")
180
+ try:
181
+ parsed = json.loads(args)
182
+ if debug:
183
+ print(f"[INFILL_PARSER] Successfully parsed inner JSON, keys: {list(parsed.keys())}")
184
+ except json.JSONDecodeError as e:
185
+ if debug:
186
+ print(f"[INFILL_PARSER] Failed to parse inner JSON string: {e}")
187
+ return None
188
+ elif isinstance(args, dict):
189
+ # Arguments is already a dict
190
+ parsed = args
191
+ if debug:
192
+ print("[INFILL_PARSER] Arguments is already a dict")
193
+
194
+ # Also handle: {"name": "...", "parameters": {...}} or string parameters
195
+ if 'parameters' in parsed:
196
+ params = parsed['parameters']
197
+ if isinstance(params, str):
198
+ try:
199
+ parsed = json.loads(params)
200
+ except json.JSONDecodeError:
201
+ return None
202
+ elif isinstance(params, dict):
203
+ parsed = params
204
 
205
  # Validate required fields
206
  if 'filled_text' not in parsed and 'gaps' not in parsed:
207
+ if debug:
208
+ print(f"[INFILL_PARSER] Missing required fields. Found: {list(parsed.keys())}")
209
  return None
210
+
211
+ if debug:
212
+ print(f"[INFILL_PARSER] Success! filled_text present: {'filled_text' in parsed}, gaps count: {len(parsed.get('gaps', []))}")
213
 
214
  return parsed
215
+ except json.JSONDecodeError as e:
216
+ if debug:
217
+ print(f"[INFILL_PARSER] JSON decode error: {e}")
218
  return None
219
 
220
 
app/main.py CHANGED
@@ -393,11 +393,17 @@ async def process_infill_item(
393
  Returns InfillResult with status, filled_text, and gaps.
394
  """
395
  try:
 
 
 
 
396
  # Normalize gaps to [GAP:n] format
397
  normalized_text, gaps = normalize_gaps_to_tagged(item.text_with_gaps)
 
398
 
399
  if not gaps:
400
  # No gaps found, return original text
 
401
  return InfillResult(
402
  id=item.id,
403
  status="ok",
@@ -408,6 +414,9 @@ async def process_infill_item(
408
 
409
  # Build prompt
410
  chat_messages = create_infill_prompt(normalized_text, options)
 
 
 
411
 
412
  # Generate
413
  llm = await registry.get_model(model_name)
@@ -418,11 +427,16 @@ async def process_infill_item(
418
  top_p=0.9,
419
  )
420
 
 
 
 
421
  # Parse JSON from output
422
  parsed = parse_infill_json(raw_output)
 
423
 
424
  if not parsed:
425
  # JSON parsing failed
 
426
  return InfillResult(
427
  id=item.id,
428
  status="error",
@@ -445,11 +459,16 @@ async def process_infill_item(
445
  gap_fills.append(gap_fill)
446
  fills_dict[gap_fill.index] = gap_fill.choice
447
 
 
 
448
  # Get filled text - prefer model's version, fallback to reconstruction
449
  filled_text = parsed.get("filled_text")
450
  if not filled_text and fills_dict:
451
  filled_text = apply_fills(normalized_text, gaps, fills_dict)
452
 
 
 
 
453
  return InfillResult(
454
  id=item.id,
455
  status="ok",
@@ -459,6 +478,9 @@ async def process_infill_item(
459
  )
460
 
461
  except Exception as e:
 
 
 
462
  return InfillResult(
463
  id=item.id,
464
  status="error",
 
393
  Returns InfillResult with status, filled_text, and gaps.
394
  """
395
  try:
396
+ print(f"\n{'='*60}")
397
+ print(f"[INFILL] Processing item id={item.id} with model={model_name}")
398
+ print(f"[INFILL] Input text: {item.text_with_gaps[:200]}...")
399
+
400
  # Normalize gaps to [GAP:n] format
401
  normalized_text, gaps = normalize_gaps_to_tagged(item.text_with_gaps)
402
+ print(f"[INFILL] Detected {len(gaps)} gaps: {gaps}")
403
 
404
  if not gaps:
405
  # No gaps found, return original text
406
+ print("[INFILL] No gaps found, returning original text")
407
  return InfillResult(
408
  id=item.id,
409
  status="ok",
 
414
 
415
  # Build prompt
416
  chat_messages = create_infill_prompt(normalized_text, options)
417
+ print(f"[INFILL] Prompt messages: {len(chat_messages)} messages")
418
+ for i, msg in enumerate(chat_messages):
419
+ print(f"[INFILL] Message {i} ({msg.get('role', 'unknown')}): {str(msg.get('content', ''))[:300]}...")
420
 
421
  # Generate
422
  llm = await registry.get_model(model_name)
 
427
  top_p=0.9,
428
  )
429
 
430
+ print(f"[INFILL] Raw model output ({len(raw_output)} chars):")
431
+ print(f"[INFILL] {raw_output}")
432
+
433
  # Parse JSON from output
434
  parsed = parse_infill_json(raw_output)
435
+ print(f"[INFILL] Parsed result: {parsed}")
436
 
437
  if not parsed:
438
  # JSON parsing failed
439
+ print(f"[INFILL] ERROR: JSON parsing failed!")
440
  return InfillResult(
441
  id=item.id,
442
  status="error",
 
459
  gap_fills.append(gap_fill)
460
  fills_dict[gap_fill.index] = gap_fill.choice
461
 
462
+ print(f"[INFILL] Extracted {len(gap_fills)} gap fills")
463
+
464
  # Get filled text - prefer model's version, fallback to reconstruction
465
  filled_text = parsed.get("filled_text")
466
  if not filled_text and fills_dict:
467
  filled_text = apply_fills(normalized_text, gaps, fills_dict)
468
 
469
+ print(f"[INFILL] Final filled_text: {filled_text[:200] if filled_text else 'None'}...")
470
+ print(f"[INFILL] Success for item {item.id}")
471
+
472
  return InfillResult(
473
  id=item.id,
474
  status="ok",
 
478
  )
479
 
480
  except Exception as e:
481
+ import traceback
482
+ print(f"[INFILL] EXCEPTION: {str(e)}")
483
+ print(f"[INFILL] Traceback: {traceback.format_exc()}")
484
  return InfillResult(
485
  id=item.id,
486
  status="error",