Spaces:

studzinsky
/

bielik_app_service

Sleeping

App Files Files Community

Patryk Studzinski committed on Dec 3, 2025

Commit

6cc98f9

1 Parent(s): 093fabc

Fix: Handle double-escaped JSON in infill parser + add debug logging

Browse files

Files changed (2) hide show

app/logic/infill_utils.py +61 -9
app/main.py +22 -0

app/logic/infill_utils.py CHANGED Viewed

@@ -89,7 +89,7 @@ def detect_gaps(text: str, notation: str = "auto") -> List[GapInfo]:
     return gaps
-def parse_infill_json(raw_output: str) -> Optional[dict]:
     """
     Extract and parse JSON from LLM output.
@@ -97,10 +97,12 @@ def parse_infill_json(raw_output: str) -> Optional[dict]:
     - JSON wrapped in markdown code blocks
     - Leading/trailing text before/after JSON
     - Function-call style wrapper ({"name": "...", "arguments": {...}})
     - Minor formatting issues
     Args:
         raw_output: Raw text from LLM
     Returns:
         Parsed dict with 'filled_text' and 'gaps' keys, or None if parsing fails
@@ -114,18 +116,28 @@ def parse_infill_json(raw_output: str) -> Optional[dict]:
     }
     """
     if not raw_output:
         return None
     # Try to extract JSON from markdown code blocks
     json_block_pattern = r'```(?:json)?\s*([\s\S]*?)\s*```'
     match = re.search(json_block_pattern, raw_output)
     if match:
         raw_output = match.group(1)
     # Try to find JSON object boundaries
     # Look for the outermost { }
     start_idx = raw_output.find('{')
     if start_idx == -1:
         return None
     # Find matching closing brace
@@ -141,28 +153,68 @@ def parse_infill_json(raw_output: str) -> Optional[dict]:
                 break
     if end_idx == -1:
         return None
     json_str = raw_output[start_idx:end_idx]
     try:
         parsed = json.loads(json_str)
-        # Handle function-call style wrapper:
-        # {"name": "filled_text", "arguments": {"filled_text": "...", "gaps": [...]}}
-        if 'arguments' in parsed and isinstance(parsed['arguments'], dict):
-            parsed = parsed['arguments']
-        # Also handle: {"name": "...", "parameters": {...}}
-        if 'parameters' in parsed and isinstance(parsed['parameters'], dict):
-            parsed = parsed['parameters']
         # Validate required fields
         if 'filled_text' not in parsed and 'gaps' not in parsed:
             return None
         return parsed
-    except json.JSONDecodeError:
         return None

     return gaps
+def parse_infill_json(raw_output: str, debug: bool = True) -> Optional[dict]:
     """
     Extract and parse JSON from LLM output.
     - JSON wrapped in markdown code blocks
     - Leading/trailing text before/after JSON
     - Function-call style wrapper ({"name": "...", "arguments": {...}})
+    - Double-escaped JSON strings in arguments field
     - Minor formatting issues
     Args:
         raw_output: Raw text from LLM
+        debug: If True, print debug info to logs
     Returns:
         Parsed dict with 'filled_text' and 'gaps' keys, or None if parsing fails
     }
     """
     if not raw_output:
+        if debug:
+            print("[INFILL_PARSER] Empty raw_output received")
         return None
+    if debug:
+        print(f"[INFILL_PARSER] Raw output length: {len(raw_output)}")
+        print(f"[INFILL_PARSER] Raw output preview: {raw_output[:500]}...")
     # Try to extract JSON from markdown code blocks
     json_block_pattern = r'```(?:json)?\s*([\s\S]*?)\s*```'
     match = re.search(json_block_pattern, raw_output)
     if match:
         raw_output = match.group(1)
+        if debug:
+            print("[INFILL_PARSER] Extracted from markdown code block")
     # Try to find JSON object boundaries
     # Look for the outermost { }
     start_idx = raw_output.find('{')
     if start_idx == -1:
+        if debug:
+            print("[INFILL_PARSER] No JSON object found (no opening brace)")
         return None
     # Find matching closing brace
                 break
     if end_idx == -1:
+        if debug:
+            print("[INFILL_PARSER] No matching closing brace found")
         return None
     json_str = raw_output[start_idx:end_idx]
+    if debug:
+        print(f"[INFILL_PARSER] Extracted JSON string: {json_str[:300]}...")
     try:
         parsed = json.loads(json_str)
+        if debug:
+            print(f"[INFILL_PARSER] Parsed keys: {list(parsed.keys())}")
+        # Handle function-call style wrapper with STRING arguments (double-escaped):
+        # {"name": "fill_in_text", "arguments": "{\"filled_text\": \"...\"}"}
+        if 'arguments' in parsed:
+            args = parsed['arguments']
+            if isinstance(args, str):
+                # Arguments is a JSON string - parse it
+                if debug:
+                    print(f"[INFILL_PARSER] Arguments is a string, parsing inner JSON...")
+                    print(f"[INFILL_PARSER] Arguments string preview: {args[:200]}...")
+                try:
+                    parsed = json.loads(args)
+                    if debug:
+                        print(f"[INFILL_PARSER] Successfully parsed inner JSON, keys: {list(parsed.keys())}")
+                except json.JSONDecodeError as e:
+                    if debug:
+                        print(f"[INFILL_PARSER] Failed to parse inner JSON string: {e}")
+                    return None
+            elif isinstance(args, dict):
+                # Arguments is already a dict
+                parsed = args
+                if debug:
+                    print("[INFILL_PARSER] Arguments is already a dict")
+        # Also handle: {"name": "...", "parameters": {...}} or string parameters
+        if 'parameters' in parsed:
+            params = parsed['parameters']
+            if isinstance(params, str):
+                try:
+                    parsed = json.loads(params)
+                except json.JSONDecodeError:
+                    return None
+            elif isinstance(params, dict):
+                parsed = params
         # Validate required fields
         if 'filled_text' not in parsed and 'gaps' not in parsed:
+            if debug:
+                print(f"[INFILL_PARSER] Missing required fields. Found: {list(parsed.keys())}")
             return None
+        if debug:
+            print(f"[INFILL_PARSER] Success! filled_text present: {'filled_text' in parsed}, gaps count: {len(parsed.get('gaps', []))}")
         return parsed
+    except json.JSONDecodeError as e:
+        if debug:
+            print(f"[INFILL_PARSER] JSON decode error: {e}")
         return None

app/main.py CHANGED Viewed

@@ -393,11 +393,17 @@ async def process_infill_item(
     Returns InfillResult with status, filled_text, and gaps.
     """
     try:
         # Normalize gaps to [GAP:n] format
         normalized_text, gaps = normalize_gaps_to_tagged(item.text_with_gaps)
         if not gaps:
             # No gaps found, return original text
             return InfillResult(
                 id=item.id,
                 status="ok",
@@ -408,6 +414,9 @@ async def process_infill_item(
         # Build prompt
         chat_messages = create_infill_prompt(normalized_text, options)
         # Generate
         llm = await registry.get_model(model_name)
@@ -418,11 +427,16 @@ async def process_infill_item(
             top_p=0.9,
         )
         # Parse JSON from output
         parsed = parse_infill_json(raw_output)
         if not parsed:
             # JSON parsing failed
             return InfillResult(
                 id=item.id,
                 status="error",
@@ -445,11 +459,16 @@ async def process_infill_item(
             gap_fills.append(gap_fill)
             fills_dict[gap_fill.index] = gap_fill.choice
         # Get filled text - prefer model's version, fallback to reconstruction
         filled_text = parsed.get("filled_text")
         if not filled_text and fills_dict:
             filled_text = apply_fills(normalized_text, gaps, fills_dict)
         return InfillResult(
             id=item.id,
             status="ok",
@@ -459,6 +478,9 @@ async def process_infill_item(
         )
     except Exception as e:
         return InfillResult(
             id=item.id,
             status="error",

     Returns InfillResult with status, filled_text, and gaps.
     """
     try:
+        print(f"\n{'='*60}")
+        print(f"[INFILL] Processing item id={item.id} with model={model_name}")
+        print(f"[INFILL] Input text: {item.text_with_gaps[:200]}...")
         # Normalize gaps to [GAP:n] format
         normalized_text, gaps = normalize_gaps_to_tagged(item.text_with_gaps)
+        print(f"[INFILL] Detected {len(gaps)} gaps: {gaps}")
         if not gaps:
             # No gaps found, return original text
+            print("[INFILL] No gaps found, returning original text")
             return InfillResult(
                 id=item.id,
                 status="ok",
         # Build prompt
         chat_messages = create_infill_prompt(normalized_text, options)
+        print(f"[INFILL] Prompt messages: {len(chat_messages)} messages")
+        for i, msg in enumerate(chat_messages):
+            print(f"[INFILL] Message {i} ({msg.get('role', 'unknown')}): {str(msg.get('content', ''))[:300]}...")
         # Generate
         llm = await registry.get_model(model_name)
             top_p=0.9,
         )
+        print(f"[INFILL] Raw model output ({len(raw_output)} chars):")
+        print(f"[INFILL] {raw_output}")
         # Parse JSON from output
         parsed = parse_infill_json(raw_output)
+        print(f"[INFILL] Parsed result: {parsed}")
         if not parsed:
             # JSON parsing failed
+            print(f"[INFILL] ERROR: JSON parsing failed!")
             return InfillResult(
                 id=item.id,
                 status="error",
             gap_fills.append(gap_fill)
             fills_dict[gap_fill.index] = gap_fill.choice
+        print(f"[INFILL] Extracted {len(gap_fills)} gap fills")
         # Get filled text - prefer model's version, fallback to reconstruction
         filled_text = parsed.get("filled_text")
         if not filled_text and fills_dict:
             filled_text = apply_fills(normalized_text, gaps, fills_dict)
+        print(f"[INFILL] Final filled_text: {filled_text[:200] if filled_text else 'None'}...")
+        print(f"[INFILL] Success for item {item.id}")
         return InfillResult(
             id=item.id,
             status="ok",
         )
     except Exception as e:
+        import traceback
+        print(f"[INFILL] EXCEPTION: {str(e)}")
+        print(f"[INFILL] Traceback: {traceback.format_exc()}")
         return InfillResult(
             id=item.id,
             status="error",