ai_chat_api / utils /json_extractor.py
Soumik Bose
Add local wheel with LFS
a2e3298
import json
import logging
from typing import List, Any
# Module-level logger registered under the fixed name "json-extractor".
logger = logging.getLogger("json-extractor")
def find_balanced_closing_index(text: str, start_index: int) -> int:
    """
    Return the index of the bracket that closes the one at ``start_index``.

    The scan understands enough JavaScript-like syntax to avoid being fooled
    by brackets that are not structural:

    * text inside ``"..."``, ``'...'`` and backtick-quoted strings is skipped;
    * text inside ``// ...`` line comments and ``/* ... */`` block comments
      is skipped;
    * outside comments, a backslash makes the scanner skip the character
      that follows it (so ``\\"`` does not terminate a string).

    Only brackets of the same kind as the opening one are counted: when the
    opener is ``{`` the scan tracks ``{``/``}`` and ignores ``[``/``]``, and
    vice versa.

    Args:
        text: The text to scan.
        start_index: Position of the opening ``{`` or ``[`` in ``text``.

    Returns:
        The index of the matching closing bracket, or ``-1`` when no match
        exists, when ``start_index`` lies outside ``text``, or when the
        character at ``start_index`` is not ``{`` or ``[``.
    """
    end = len(text)
    # Reject out-of-range positions up front; a negative index would
    # otherwise wrap around and scan from the tail of the string.
    if not 0 <= start_index < end:
        return -1

    opener = text[start_index]
    closer = {"{": "}", "[": "]"}.get(opener)
    if closer is None:
        # Not positioned on an opening bracket: there is nothing to balance.
        return -1

    line_comment = "//"
    block_comment = "/*"
    quote_chars = ('"', "'", "`")

    # ``mode`` is None while scanning structural text, one of the comment
    # markers while inside a comment, or the quote character that opened the
    # current string. The states are mutually exclusive.
    mode = None
    depth = 0
    pos = start_index

    while pos < end:
        ch = text[pos]

        if mode == line_comment:
            if ch == "\n":
                mode = None
            pos += 1
            continue

        if mode == block_comment:
            if text.startswith("*/", pos):
                mode = None
                pos += 2
            else:
                pos += 1
            continue

        # Outside comments a backslash always swallows the next character,
        # whether or not we are currently inside a string.
        if ch == "\\":
            pos += 2
            continue

        if mode is not None:
            # Inside a string: only the matching quote character ends it.
            if ch == mode:
                mode = None
            pos += 1
            continue

        if ch == "/":
            following = text[pos + 1:pos + 2]
            if following == "/":
                mode = line_comment
                pos += 2
                continue
            if following == "*":
                mode = block_comment
                pos += 2
                continue

        if ch in quote_chars:
            mode = ch
            pos += 1
            continue

        if ch == opener:
            depth += 1
        elif ch == closer:
            depth -= 1
            if depth == 0:
                return pos
        pos += 1

    return -1
def extract_json_from_content(content: str) -> List[Any]:
    """
    Extract every JSON object or array embedded in free-form text.

    The text is scanned left to right. At each ``{`` or ``[`` the standard
    library decoder attempts to parse a JSON value starting at that exact
    position. On success the parsed value is collected and scanning resumes
    right after it; on failure scanning resumes at the next character, so a
    valid value nested inside an invalid outer bracket is still found.

    Args:
        content: Arbitrary text that may contain JSON objects or arrays.

    Returns:
        The parsed values in order of appearance. Returns an empty list when
        ``content`` is empty or is not a string.
    """
    if not content or not isinstance(content, str):
        return []

    # raw_decode parses in place from a given offset and reports where the
    # value ended, so no pre-scan for the closing bracket and no slice copy
    # of each candidate is needed.
    decode_at = json.JSONDecoder().raw_decode

    found_blocks: List[Any] = []
    cursor = 0
    length = len(content)

    while cursor < length:
        if content[cursor] not in "{[":
            cursor += 1
            continue

        try:
            parsed, next_cursor = decode_at(content, cursor)
        except (ValueError, RecursionError):
            # ValueError covers json.JSONDecodeError as well as the plain
            # ValueError raised for integer literals beyond the interpreter's
            # digit limit; RecursionError covers pathologically deep nesting.
            # In every case this position is simply not a usable JSON start.
            cursor += 1
            continue

        found_blocks.append(parsed)
        cursor = next_cursor

    return found_blocks