import json
import logging
from typing import List, Any

logger = logging.getLogger("json-extractor")

# Opening bracket -> matching closing bracket for the two JSON containers.
_BRACKET_PAIRS = {"{": "}", "[": "]"}

# Characters that open (and close) a string literal for the scanner.
_QUOTE_CHARS = "\"'`"


def find_balanced_closing_index(text: str, start_index: int) -> int:
    """Return the index of the bracket that closes the one at ``start_index``.

    Only brackets of the same kind as the opening one are counted, and
    brackets inside string literals (double-quoted, single-quoted or
    backtick-quoted) or inside ``//`` line comments and ``/* */`` block
    comments are ignored.

    Args:
        text: The text to scan.
        start_index: Position of the opening ``{`` or ``[`` in ``text``.

    Returns:
        The index of the matching closing bracket, or ``-1`` when
        ``start_index`` is out of range, does not point at ``{`` or ``[``,
        or no balanced closing bracket exists.
    """
    length = len(text)
    if not 0 <= start_index < length:
        return -1

    open_char = text[start_index]
    close_char = _BRACKET_PAIRS.get(open_char)
    if close_char is None:
        # Not an opening bracket: there is nothing to balance.
        return -1

    depth = 0
    active_quote = ""  # The delimiter of the string we are inside, if any.
    in_line_comment = False
    in_block_comment = False

    i = start_index
    while i < length:
        char = text[i]

        # A backslash outside comments escapes the following character, so
        # skip both (an escaped quote must not terminate a string).
        if char == "\\" and not (in_line_comment or in_block_comment):
            i += 2
            continue

        if in_line_comment:
            if char == "\n":
                in_line_comment = False
            i += 1
            continue

        if in_block_comment:
            if text.startswith("*/", i):
                in_block_comment = False
                i += 2
            else:
                i += 1
            continue

        if active_quote:
            # Inside a string only the matching delimiter is significant.
            if char == active_quote:
                active_quote = ""
            i += 1
            continue

        # From here on we are outside strings and comments.
        if text.startswith("//", i):
            in_line_comment = True
            i += 2
            continue
        if text.startswith("/*", i):
            in_block_comment = True
            i += 2
            continue

        if char in _QUOTE_CHARS:
            active_quote = char
            i += 1
            continue

        if char == open_char:
            depth += 1
        elif char == close_char:
            depth -= 1
            if depth == 0:
                return i
        i += 1

    return -1


def extract_json_from_content(content: str) -> List[Any]:
    """Extract every JSON object or array embedded in ``content``.

    The text is scanned left to right. At each ``{`` or ``[`` the balanced
    closing bracket is located and the enclosed span is handed to
    ``json.loads``. A span that parses is collected and scanning resumes
    after it; otherwise scanning resumes at the next character, so valid
    JSON nested inside an invalid outer span is still found.

    Args:
        content: Arbitrary text that may contain JSON fragments.

    Returns:
        The parsed objects/arrays in order of appearance. An empty list is
        returned when ``content`` is empty or is not a string.
    """
    # Check the type first: truthiness of arbitrary objects may itself raise.
    if not isinstance(content, str) or not content:
        return []

    found_blocks: List[Any] = []
    cursor = 0
    length = len(content)

    while cursor < length:
        if content[cursor] not in _BRACKET_PAIRS:
            cursor += 1
            continue

        end_index = find_balanced_closing_index(content, cursor)
        if end_index != -1:
            try:
                parsed = json.loads(content[cursor:end_index + 1])
            except json.JSONDecodeError as exc:
                # Best effort: a balanced span that is not JSON is skipped.
                logger.debug(
                    "Candidate at %d-%d is not valid JSON: %s",
                    cursor,
                    end_index,
                    exc,
                )
            else:
                found_blocks.append(parsed)
                cursor = end_index + 1
                continue

        cursor += 1

    return found_blocks