# Work in progress, but quite good already.
# Able to parse JSON like this, even when cut in half:
#
# {
#     name: John Doe,
#     'age': 30,
#     'some': undefined,
#     other: tRue,
#     city: "New York",
#     "hobbies": ["reading", 'cycling'],
#     married: false,
#     children: null,
#     "bio": """A multi-line
#     biography that
#     spans several lines""",
#     'quote': """Another
#     multi-line quote
#     using single quotes"""
# }


class DirtyJson:
    """Lenient, best-effort parser for malformed or truncated JSON text.

    Compared to strict JSON the parser additionally accepts:

    * unquoted keys and unquoted string values,
    * single-quoted and backtick-quoted strings,
    * triple-quoted multi-line strings,
    * case-insensitive ``true`` / ``false`` / ``null`` / ``undefined``,
    * missing commas between object members,
    * input that ends in the middle of a value (whatever was parsed so far
      is returned).

    Use :meth:`parse_string` for one-shot parsing, or :meth:`parse` followed
    by repeated :meth:`feed` calls when the text arrives in chunks.
    """

    # Characters that may open a quoted string value or key.
    _QUOTES = ('"', "'", "`")
    # Characters that terminate an unquoted token.
    _DELIMITERS = (":", ",", "}", "]")
    # Valid digits of a ``\uXXXX`` escape.
    _HEX_DIGITS = frozenset("0123456789abcdefABCDEF")
    # Single-character escapes that map to a control character.
    _ESCAPES = {"b": "\b", "f": "\f", "n": "\n", "r": "\r", "t": "\t"}

    def __init__(self):
        self._reset()

    def _reset(self):
        """Drop all buffered text and parser state."""
        self.json_string = ""
        self.index = 0
        self.current_char = None
        self.result = None
        self.stack = []

    @staticmethod
    def parse_string(json_string):
        """Parse ``json_string`` with a fresh parser and return the value."""
        return DirtyJson().parse(json_string)

    def parse(self, json_string):
        """Reset the parser, parse ``json_string`` and return the value.

        Empty (or ``None``) input yields ``None`` instead of raising.
        """
        self._reset()
        self.json_string = json_string or ""
        if self.json_string:
            self.current_char = self.json_string[0]
            self._parse()
        return self.result

    def feed(self, chunk):
        """Append ``chunk`` to the buffered text and return the new result.

        The whole buffer is parsed again from the start. A chunk boundary can
        fall inside any token (string, escape, literal, number), and a
        truncated token cannot be resumed reliably, so re-parsing is the only
        way to keep the result consistent with the text received so far.
        """
        return self.parse(self.json_string + chunk)

    def _advance(self, count=1):
        """Move the cursor ``count`` characters forward."""
        self.index += count
        if self.index < len(self.json_string):
            self.current_char = self.json_string[self.index]
        else:
            self.current_char = None

    def _skip_whitespace(self):
        """Advance past any whitespace at the cursor."""
        while self.current_char is not None and self.current_char.isspace():
            self._advance()

    def _parse(self):
        """Parse one value at the cursor and store it in ``self.result``."""
        self.result = self._parse_value()

    def _at_token_end(self, position) -> bool:
        """Return True if ``position`` is end of input, whitespace or a delimiter."""
        if position >= len(self.json_string):
            return True
        char = self.json_string[position]
        return char.isspace() or char in self._DELIMITERS

    def _parse_value(self):
        """Parse and return the value starting at the cursor.

        Returns ``None`` when the input is exhausted.
        """
        self._skip_whitespace()
        char = self.current_char
        if char is None:
            return None
        if char == "{":
            return self._parse_object()
        if char == "[":
            return self._parse_array()
        if char in self._QUOTES:
            # _peek() includes the current character, so three characters are
            # needed to tell a triple quote from an empty string ("").
            if self._peek(3) == char * 3:
                return self._parse_multiline_string()
            return self._parse_string()
        if char.isdigit() or char in "+-":
            return self._parse_number()
        if self._match("true"):
            return True
        if self._match("false"):
            return False
        if self._match("null") or self._match("undefined"):
            return None
        return self._parse_unquoted_string()

    def _match(self, text: str) -> bool:
        """Consume ``text`` (case-insensitively) if it is the next whole token.

        The literal must be followed by end of input, whitespace or a
        delimiter, so that e.g. ``truest`` is not read as ``true`` + ``st``.
        """
        length = len(text)
        if self._peek(length).lower() != text.lower():
            return False
        if not self._at_token_end(self.index + length):
            return False
        self._advance(length)
        return True

    def _parse_object(self):
        """Parse an object whose opening brace is at the cursor."""
        obj = {}
        self._advance()  # Skip opening brace
        self.stack.append(obj)
        try:
            self._parse_object_content()
        finally:
            self.stack.pop()
        return obj

    def _parse_object_content(self):
        """Fill the object on top of the stack until ``}`` or end of input."""
        obj = self.stack[-1]
        while True:
            self._skip_whitespace()
            if self.current_char is None:
                return  # End of input reached while parsing object
            if self.current_char == "}":
                self._advance()
                return
            if self.current_char == "]":
                # A stray closing bracket can start neither a key nor a value;
                # skip it so that every iteration makes progress.
                self._advance()
                continue

            key = self._parse_key()
            self._skip_whitespace()
            if self.current_char is None:
                value = None  # End of input reached after key
            else:
                if self.current_char == ":":
                    self._advance()
                value = self._parse_value()
            obj[key] = value

            self._skip_whitespace()
            if self.current_char == ",":
                self._advance()
            # Anything else: either the closing brace / end of input (handled
            # at the top of the loop) or a missing comma, which is tolerated.

    def _parse_key(self):
        """Parse an object key, quoted or unquoted."""
        self._skip_whitespace()
        if self.current_char in self._QUOTES:
            return self._parse_string()
        return self._parse_unquoted_key()

    def _parse_unquoted_key(self) -> str:
        """Read a bare key up to whitespace or a delimiter."""
        start = self.index
        while (
            self.current_char is not None
            and not self.current_char.isspace()
            and self.current_char not in self._DELIMITERS
        ):
            self._advance()
        return self.json_string[start:self.index]

    def _parse_array(self):
        """Parse an array whose opening bracket is at the cursor."""
        arr = []
        self._advance()  # Skip opening bracket
        self.stack.append(arr)
        try:
            self._parse_array_content()
        finally:
            self.stack.pop()
        return arr

    def _parse_array_content(self):
        """Fill the array on top of the stack until ``]`` or end of input."""
        arr = self.stack[-1]
        while True:
            self._skip_whitespace()
            if self.current_char is None:
                return  # End of input reached while parsing array
            if self.current_char == "]":
                self._advance()
                return
            arr.append(self._parse_value())
            self._skip_whitespace()
            if self.current_char == ",":
                self._advance()
            elif self.current_char != "]":
                # Unexpected character (or end of input): treat the array as
                # finished and let the enclosing container deal with it.
                return

    def _read_hex4(self, position):
        """Return the code point of four hex digits at ``position``, else None."""
        digits = self.json_string[position:position + 4]
        if len(digits) == 4 and all(d in self._HEX_DIGITS for d in digits):
            return int(digits, 16)
        return None

    def _parse_string(self) -> str:
        """Parse a quoted string whose opening quote is at the cursor.

        A missing closing quote is tolerated: the text up to the end of the
        input is returned. Unknown escapes keep the escaped character.
        """
        quote_char = self.current_char
        self._advance()  # Skip opening quote
        parts = []
        while self.current_char is not None and self.current_char != quote_char:
            if self.current_char != "\\":
                parts.append(self.current_char)
                self._advance()
                continue

            self._advance()  # Skip the backslash
            escaped = self.current_char
            if escaped is None:
                break  # Input ended right after the backslash
            if escaped != "u":
                parts.append(self._ESCAPES.get(escaped, escaped))
                self._advance()
                continue

            remaining = len(self.json_string) - (self.index + 1)
            if remaining < 4:
                # Truncated \uXXXX escape: drop it and stop at end of input.
                self._advance(remaining + 1)
                break
            code = self._read_hex4(self.index + 1)
            if code is None:
                # Not a valid escape; keep the 'u' as a literal character.
                parts.append(escaped)
                self._advance()
                continue
            self._advance(5)  # Skip 'u' and the four hex digits
            if 0xD800 <= code <= 0xDBFF and self._peek(2) == "\\u":
                # Combine a UTF-16 surrogate pair into one code point.
                low = self._read_hex4(self.index + 2)
                if low is not None and 0xDC00 <= low <= 0xDFFF:
                    code = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00)
                    self._advance(6)
            parts.append(chr(code))

        if self.current_char == quote_char:
            self._advance()  # Skip closing quote
        return "".join(parts)

    def _parse_multiline_string(self) -> str:
        """Parse a triple-quoted string and return its stripped content.

        If the closing triple quote is missing, everything up to the end of
        the input is taken as the content.
        """
        quote_char = self.current_char
        self._advance(3)  # Skip opening triple quote
        start = self.index
        end = self.json_string.find(quote_char * 3, start)
        if end == -1:
            content = self.json_string[start:]
            self._advance(len(self.json_string) - self.index)
        else:
            content = self.json_string[start:end]
            self._advance(end + 3 - self.index)  # Skip closing triple quote
        return content.strip()

    def _parse_number(self):
        """Parse an int or float; fall back to an unquoted string.

        Text that merely starts like a number (``2024-01-01``, ``3f2a``,
        ``-``) is returned as a string instead of raising ``ValueError``.
        """
        start = self.index
        while self.current_char is not None and (
            self.current_char.isdigit() or self.current_char in "-+.eE"
        ):
            self._advance()
        text = self.json_string[start:self.index]
        if self._at_token_end(self.index):
            for convert in (int, float):
                try:
                    return convert(text)
                except ValueError:
                    continue
        # Not a number after all: rewind and read the token as plain text.
        self.index = start
        self.current_char = self.json_string[start]
        return self._parse_unquoted_string()

    def _parse_unquoted_string(self) -> str:
        """Read bare text up to the next delimiter and return it stripped.

        The delimiter itself is left in place so that the enclosing object or
        array can react to it (in particular to its own closing character).
        """
        start = self.index
        while (
            self.current_char is not None
            and self.current_char not in self._DELIMITERS
        ):
            self._advance()
        return self.json_string[start:self.index].strip()

    def _peek(self, n) -> str:
        """Return up to ``n`` characters starting at the cursor (inclusive)."""
        return self.json_string[self.index:self.index + n]