Upload folder using huggingface_hub

f39d337 verified 8 months ago

51.3 kB


	# Scanner produces tokens of the following types:
	# STREAM-START
	# STREAM-END
	# DIRECTIVE(name, value)
	# DOCUMENT-START
	# DOCUMENT-END
	# BLOCK-SEQUENCE-START
	# BLOCK-MAPPING-START
	# BLOCK-END
	# FLOW-SEQUENCE-START
	# FLOW-MAPPING-START
	# FLOW-SEQUENCE-END
	# FLOW-MAPPING-END
	# BLOCK-ENTRY
	# FLOW-ENTRY
	# KEY
	# VALUE
	# ALIAS(value)
	# ANCHOR(value)
	# TAG(value)
	# SCALAR(value, plain, style)
	#
	# Read comments in the Scanner code for more details.
	#

	__all__ = ['Scanner', 'ScannerError']

	from .error import MarkedYAMLError
	from .tokens import *

	class ScannerError(MarkedYAMLError):
	    """Error raised by the scanner when the input cannot be tokenized."""

	class SimpleKey:
	    """Record of a position at which a simple key may start.

	    See the simple keys treatment in the Scanner class.
	    """

	    def __init__(self, token_number, required, index, line, column, mark):
	        # Number of the token at which the candidate key starts.
	        self.token_number, self.required = token_number, required
	        # Stream position of the candidate key.
	        self.index, self.line, self.column = index, line, column
	        # Mark used when reporting errors about this key.
	        self.mark = mark

	class Scanner:

	def __init__(self):
	    """Initialize the scanner state and queue the STREAM-START token."""
	    # It is assumed that Scanner and Reader will have a common descendant.
	    # Reader does the dirty work of checking for BOM and converting the
	    # input data to Unicode. It also adds NUL to the end.
	    #
	    # Reader supports the following methods
	    #   self.peek(i=0)       # peek the next i-th character
	    #   self.prefix(l=1)     # peek the next l characters
	    #   self.forward(l=1)    # read the next l characters and move the pointer.

	    # Had we reached the end of the stream?
	    self.done = False

	    # The number of unclosed '{' and '['. `flow_level == 0` means block
	    # context.
	    self.flow_level = 0

	    # List of processed tokens that are not yet emitted.
	    self.tokens = []

	    # Add the STREAM-START token. This must run after `self.tokens` has
	    # been created because the token is appended to that list.
	    self.fetch_stream_start()

	    # Number of tokens that were emitted through the `get_token` method.
	    self.tokens_taken = 0

	    # The current indentation level.
	    self.indent = -1

	    # Past indentation levels.
	    self.indents = []

	    # Variables related to simple keys treatment.

	    # A simple key is a key that is not denoted by the '?' indicator.
	    # Example of simple keys:
	    #   ---
	    #   block simple key: value
	    #   ? not a simple key:
	    #   : { flow simple key: value }
	    # We emit the KEY token before all keys, so when we find a potential
	    # simple key, we try to locate the corresponding ':' indicator.
	    # Simple keys should be limited to a single line and 1024 characters.

	    # Can a simple key start at the current position? A simple key may
	    # start:
	    # - at the beginning of the line, not counting indentation spaces
	    #       (in block context),
	    # - after '{', '[', ',' (in the flow context),
	    # - after '?', ':', '-' (in the block context).
	    # In the block context, this flag also signifies if a block collection
	    # may start at the current position.
	    self.allow_simple_key = True

	    # Keep track of possible simple keys. This is a dictionary. The key
	    # is `flow_level`; there can be no more than one possible simple key
	    # for each level. The value is a SimpleKey record:
	    #   (token_number, required, index, line, column, mark)
	    # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
	    # '[', or '{' tokens.
	    self.possible_simple_keys = {}

	# Public methods.

	def check_token(self, *choices):
	# Check if the next token is one of the given types.
	while self.need_more_tokens():
	self.fetch_more_tokens()
	if self.tokens:
	if not choices:
	return True
	for choice in choices:
	if isinstance(self.tokens[0], choice):
	return True
	return False

	def peek_token(self):
	# Return the next token, but do not delete if from the queue.
	# Return None if no more tokens.
	while self.need_more_tokens():
	self.fetch_more_tokens()
	if self.tokens:
	return self.tokens[0]
	else:
	return None

	def get_token(self):
	# Return the next token.
	while self.need_more_tokens():
	self.fetch_more_tokens()
	if self.tokens:
	self.tokens_taken += 1
	return self.tokens.pop(0)

	# Private methods.

	def need_more_tokens(self):
	if self.done:
	return False
	if not self.tokens:
	return True
	# The current token may be a potential simple key, so we
	# need to look further.
	self.stale_possible_simple_keys()
	if self.next_possible_simple_key() == self.tokens_taken:
	return True

	def fetch_more_tokens(self):

	# Eat whitespaces and comments until we reach the next token.
	self.scan_to_next_token()

	# Remove obsolete possible simple keys.
	self.stale_possible_simple_keys()

	# Compare the current indentation and column. It may add some tokens
	# and decrease the current indentation level.
	self.unwind_indent(self.column)

	# Peek the next character.
	ch = self.peek()

	# Is it the end of stream?
	if ch == '\0':
	return self.fetch_stream_end()

	# Is it a directive?
	if ch == '%' and self.check_directive():
	return self.fetch_directive()

	# Is it the document start?
	if ch == '-' and self.check_document_start():
	return self.fetch_document_start()

	# Is it the document end?
	if ch == '.' and self.check_document_end():
	return self.fetch_document_end()

	# TODO: support for BOM within a stream.
	#if ch == '\uFEFF':
	# return self.fetch_bom() <-- issue BOMToken

	# Note: the order of the following checks is NOT significant.

	# Is it the flow sequence start indicator?
	if ch == '[':
	return self.fetch_flow_sequence_start()

	# Is it the flow mapping start indicator?
	if ch == '{':
	return self.fetch_flow_mapping_start()

	# Is it the flow sequence end indicator?
	if ch == ']':
	return self.fetch_flow_sequence_end()

	# Is it the flow mapping end indicator?
	if ch == '}':
	return self.fetch_flow_mapping_end()

	# Is it the flow entry indicator?
	if ch == ',':
	return self.fetch_flow_entry()

	# Is it the block entry indicator?
	if ch == '-' and self.check_block_entry():
	return self.fetch_block_entry()

	# Is it the key indicator?
	if ch == '?' and self.check_key():
	return self.fetch_key()

	# Is it the value indicator?
	if ch == ':' and self.check_value():
	return self.fetch_value()

	# Is it an alias?
	if ch == '*':
	return self.fetch_alias()

	# Is it an anchor?
	if ch == '&':
	return self.fetch_anchor()

	# Is it a tag?
	if ch == '!':
	return self.fetch_tag()

	# Is it a literal scalar?
	if ch == '\|' and not self.flow_level:
	return self.fetch_literal()

	# Is it a folded scalar?
	if ch == '>' and not self.flow_level:
	return self.fetch_folded()

	# Is it a single quoted scalar?
	if ch == '\'':
	return self.fetch_single()

	# Is it a double quoted scalar?
	if ch == '\"':
	return self.fetch_double()

	# It must be a plain scalar then.
	if self.check_plain():
	return self.fetch_plain()

	# No? It's an error. Let's produce a nice error message.
	raise ScannerError("while scanning for the next token", None,
	"found character %r that cannot start any token" % ch,
	self.get_mark())

	# Simple keys treatment.

	def next_possible_simple_key(self):
	# Return the number of the nearest possible simple key. Actually we
	# don't need to loop through the whole dictionary. We may replace it
	# with the following code:
	# if not self.possible_simple_keys:
	# return None
	# return self.possible_simple_keys[
	# min(self.possible_simple_keys.keys())].token_number
	min_token_number = None
	for level in self.possible_simple_keys:
	key = self.possible_simple_keys[level]
	if min_token_number is None or key.token_number < min_token_number:
	min_token_number = key.token_number
	return min_token_number

	def stale_possible_simple_keys(self):
	# Remove entries that are no longer possible simple keys. According to
	# the YAML specification, simple keys
	# - should be limited to a single line,
	# - should be no longer than 1024 characters.
	# Disabling this procedure will allow simple keys of any length and
	# height (may cause problems if indentation is broken though).
	for level in list(self.possible_simple_keys):
	key = self.possible_simple_keys[level]
	if key.line != self.line \
	or self.index-key.index > 1024:
	if key.required:
	raise ScannerError("while scanning a simple key", key.mark,
	"could not find expected ':'", self.get_mark())
	del self.possible_simple_keys[level]

	def save_possible_simple_key(self):
	# The next token may start a simple key. We check if it's possible
	# and save its position. This function is called for
	# ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.

	# Check if a simple key is required at the current position.
	required = not self.flow_level and self.indent == self.column

	# The next token might be a simple key. Let's save it's number and
	# position.
	if self.allow_simple_key:
	self.remove_possible_simple_key()
	token_number = self.tokens_taken+len(self.tokens)
	key = SimpleKey(token_number, required,
	self.index, self.line, self.column, self.get_mark())
	self.possible_simple_keys[self.flow_level] = key

	def remove_possible_simple_key(self):
	# Remove the saved possible key position at the current flow level.
	if self.flow_level in self.possible_simple_keys:
	key = self.possible_simple_keys[self.flow_level]

	if key.required:
	raise ScannerError("while scanning a simple key", key.mark,
	"could not find expected ':'", self.get_mark())

	del self.possible_simple_keys[self.flow_level]

	# Indentation functions.

	def unwind_indent(self, column):

	## In flow context, tokens should respect indentation.
	## Actually the condition should be `self.indent >= column` according to
	## the spec. But this condition will prohibit intuitively correct
	## constructions such as
	## key : {
	## }
	#if self.flow_level and self.indent > column:
	# raise ScannerError(None, None,
	# "invalid indentation or unclosed '[' or '{'",
	# self.get_mark())

	# In the flow context, indentation is ignored. We make the scanner less
	# restrictive then specification requires.
	if self.flow_level:
	return

	# In block context, we may need to issue the BLOCK-END tokens.
	while self.indent > column:
	mark = self.get_mark()
	self.indent = self.indents.pop()
	self.tokens.append(BlockEndToken(mark, mark))

	def add_indent(self, column):
	# Check if we need to increase indentation.
	if self.indent < column:
	self.indents.append(self.indent)
	self.indent = column
	return True
	return False

	# Fetchers.

	def fetch_stream_start(self):
	    """Append the STREAM-START token.

	    We always add STREAM-START as the first token and STREAM-END as the
	    last token.
	    """
	    # The token has zero width: both marks are the current position.
	    here = self.get_mark()
	    token = StreamStartToken(here, here, encoding=self.encoding)
	    self.tokens.append(token)


	def fetch_stream_end(self):
	    """Append the STREAM-END token and mark the stream as finished."""
	    # Close every open block by setting the current indentation to -1.
	    self.unwind_indent(-1)

	    # No simple key can follow the end of the stream.
	    self.remove_possible_simple_key()
	    self.allow_simple_key = False
	    self.possible_simple_keys = {}

	    # STREAM-END has zero width.
	    here = self.get_mark()
	    self.tokens.append(StreamEndToken(here, here))

	    # The stream is finished.
	    self.done = True

	def fetch_directive(self):

	# Set the current indentation to -1.
	self.unwind_indent(-1)

	# Reset simple keys.
	self.remove_possible_simple_key()
	self.allow_simple_key = False

	# Scan and add DIRECTIVE.
	self.tokens.append(self.scan_directive())

	def fetch_document_start(self):
	    """Append a DOCUMENT-START token for the '---' indicator."""
	    self.fetch_document_indicator(TokenClass=DocumentStartToken)

	def fetch_document_end(self):
	    """Append a DOCUMENT-END token for the '...' indicator."""
	    self.fetch_document_indicator(TokenClass=DocumentEndToken)

	def fetch_document_indicator(self, TokenClass):

	# Set the current indentation to -1.
	self.unwind_indent(-1)

	# Reset simple keys. Note that there could not be a block collection
	# after '---'.
	self.remove_possible_simple_key()
	self.allow_simple_key = False

	# Add DOCUMENT-START or DOCUMENT-END.
	start_mark = self.get_mark()
	self.forward(3)
	end_mark = self.get_mark()
	self.tokens.append(TokenClass(start_mark, end_mark))

	def fetch_flow_sequence_start(self):
	    """Append a FLOW-SEQUENCE-START token for '['."""
	    self.fetch_flow_collection_start(TokenClass=FlowSequenceStartToken)

	def fetch_flow_mapping_start(self):
	    """Append a FLOW-MAPPING-START token for '{'."""
	    self.fetch_flow_collection_start(TokenClass=FlowMappingStartToken)

	def fetch_flow_collection_start(self, TokenClass):

	# '[' and '{' may start a simple key.
	self.save_possible_simple_key()

	# Increase the flow level.
	self.flow_level += 1

	# Simple keys are allowed after '[' and '{'.
	self.allow_simple_key = True

	# Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
	start_mark = self.get_mark()
	self.forward()
	end_mark = self.get_mark()
	self.tokens.append(TokenClass(start_mark, end_mark))

	def fetch_flow_sequence_end(self):
	    """Append a FLOW-SEQUENCE-END token for ']'."""
	    self.fetch_flow_collection_end(TokenClass=FlowSequenceEndToken)

	def fetch_flow_mapping_end(self):
	    """Append a FLOW-MAPPING-END token for '}'."""
	    self.fetch_flow_collection_end(TokenClass=FlowMappingEndToken)

	def fetch_flow_collection_end(self, TokenClass):

	# Reset possible simple key on the current level.
	self.remove_possible_simple_key()

	# Decrease the flow level.
	self.flow_level -= 1

	# No simple keys after ']' or '}'.
	self.allow_simple_key = False

	# Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
	start_mark = self.get_mark()
	self.forward()
	end_mark = self.get_mark()
	self.tokens.append(TokenClass(start_mark, end_mark))

	def fetch_flow_entry(self):
	    """Consume ',' and append a FLOW-ENTRY token."""
	    # Simple keys are allowed after ','.
	    self.allow_simple_key = True

	    # Any candidate saved on the current level ends here.
	    self.remove_possible_simple_key()

	    comma_at = self.get_mark()
	    self.forward()
	    token = FlowEntryToken(comma_at, self.get_mark())
	    self.tokens.append(token)

	def fetch_block_entry(self):
	    """Consume '-' and append a BLOCK-ENTRY token.

	    In block context this may first append BLOCK-SEQUENCE-START.
	    Raises ScannerError if a sequence entry is not allowed at this position.
	    It's an error for the block entry to occur in the flow context, but we
	    let the parser detect this.
	    """
	    if not self.flow_level:

	        # Are we allowed to start a new entry?
	        if not self.allow_simple_key:
	            raise ScannerError(None, None,
	                    "sequence entries are not allowed here",
	                    self.get_mark())

	        # A deeper column opens a new block sequence.
	        if self.add_indent(self.column):
	            here = self.get_mark()
	            self.tokens.append(BlockSequenceStartToken(here, here))

	    # Simple keys are allowed after '-'.
	    self.allow_simple_key = True

	    # Any candidate saved on the current level ends here.
	    self.remove_possible_simple_key()

	    dash_at = self.get_mark()
	    self.forward()
	    token = BlockEntryToken(dash_at, self.get_mark())
	    self.tokens.append(token)

	def fetch_key(self):
	    """Consume '?' and append a KEY token.

	    In block context this may first append BLOCK-MAPPING-START.
	    Raises ScannerError if a mapping key is not allowed at this position.
	    """
	    in_block = not self.flow_level
	    if in_block:

	        # Are we allowed to start a key (not necessarily a simple one)?
	        if not self.allow_simple_key:
	            raise ScannerError(None, None,
	                    "mapping keys are not allowed here",
	                    self.get_mark())

	        # A deeper column opens a new block mapping.
	        if self.add_indent(self.column):
	            here = self.get_mark()
	            self.tokens.append(BlockMappingStartToken(here, here))

	    # Simple keys are allowed after '?' in the block context.
	    self.allow_simple_key = in_block

	    # Any candidate saved on the current level ends here.
	    self.remove_possible_simple_key()

	    mark_at = self.get_mark()
	    self.forward()
	    token = KeyToken(mark_at, self.get_mark())
	    self.tokens.append(token)

	def fetch_value(self):
	    """Consume ':' and append a VALUE token.

	    If a possible simple key is saved for the current flow level, a KEY
	    token (and, in block context, possibly BLOCK-MAPPING-START) is inserted
	    into the queue at that key's position. Otherwise the ':' is treated as
	    part of a complex key.

	    Raises ScannerError if a mapping value is not allowed at this position.
	    """

	    # Do we determine a simple key?
	    if self.flow_level in self.possible_simple_keys:

	        # Add KEY. The token is inserted at the queue offset of the token
	        # that started the key, not appended.
	        key = self.possible_simple_keys[self.flow_level]
	        del self.possible_simple_keys[self.flow_level]
	        self.tokens.insert(key.token_number-self.tokens_taken,
	                KeyToken(key.mark, key.mark))

	        # If this key starts a new block mapping, we need to add
	        # BLOCK-MAPPING-START. It is inserted at the same offset, so it
	        # ends up in front of the KEY token.
	        if not self.flow_level:
	            if self.add_indent(key.column):
	                self.tokens.insert(key.token_number-self.tokens_taken,
	                        BlockMappingStartToken(key.mark, key.mark))

	        # There cannot be two simple keys one after another.
	        self.allow_simple_key = False

	    # It must be a part of a complex key.
	    else:

	        # Block context needs additional checks.
	        # (Do we really need them? They will be caught by the parser
	        # anyway.)
	        if not self.flow_level:

	            # We are allowed to start a complex value if and only if
	            # we can start a simple key.
	            if not self.allow_simple_key:
	                raise ScannerError(None, None,
	                        "mapping values are not allowed here",
	                        self.get_mark())

	        # If this value starts a new block mapping, we need to add
	        # BLOCK-MAPPING-START. It will be detected as an error later by
	        # the parser.
	        if not self.flow_level:
	            if self.add_indent(self.column):
	                mark = self.get_mark()
	                self.tokens.append(BlockMappingStartToken(mark, mark))

	        # Simple keys are allowed after ':' in the block context.
	        self.allow_simple_key = not self.flow_level

	        # Reset possible simple key on the current level.
	        self.remove_possible_simple_key()

	    # Add VALUE.
	    start_mark = self.get_mark()
	    self.forward()
	    end_mark = self.get_mark()
	    self.tokens.append(ValueToken(start_mark, end_mark))

	def fetch_alias(self):
	    """Scan an alias and append the ALIAS token."""
	    # ALIAS could be a simple key.
	    self.save_possible_simple_key()

	    # No simple keys after ALIAS.
	    self.allow_simple_key = False

	    alias = self.scan_anchor(AliasToken)
	    self.tokens.append(alias)

	def fetch_anchor(self):
	    """Scan an anchor and append the ANCHOR token."""
	    # ANCHOR could start a simple key.
	    self.save_possible_simple_key()

	    # No simple keys after ANCHOR.
	    self.allow_simple_key = False

	    anchor = self.scan_anchor(AnchorToken)
	    self.tokens.append(anchor)

	def fetch_tag(self):

	# TAG could start a simple key.
	self.save_possible_simple_key()

	# No simple keys after TAG.
	self.allow_simple_key = False

	# Scan and add TAG.
	self.tokens.append(self.scan_tag())

	def fetch_literal(self):
	self.fetch_block_scalar(style='\|')

	def fetch_folded(self):
	self.fetch_block_scalar(style='>')

	def fetch_block_scalar(self, style):

	# A simple key may follow a block scalar.
	self.allow_simple_key = True

	# Reset possible simple key on the current level.
	self.remove_possible_simple_key()

	# Scan and add SCALAR.
	self.tokens.append(self.scan_block_scalar(style))

	def fetch_single(self):
	self.fetch_flow_scalar(style='\'')

	def fetch_double(self):
	self.fetch_flow_scalar(style='"')

	def fetch_flow_scalar(self, style):

	# A flow scalar could be a simple key.
	self.save_possible_simple_key()

	# No simple keys after flow scalars.
	self.allow_simple_key = False

	# Scan and add SCALAR.
	self.tokens.append(self.scan_flow_scalar(style))

	def fetch_plain(self):

	# A plain scalar could be a simple key.
	self.save_possible_simple_key()

	# No simple keys after plain scalars. But note that `scan_plain` will
	# change this flag if the scan is finished at the beginning of the
	# line.
	self.allow_simple_key = False

	# Scan and add SCALAR. May change `allow_simple_key`.
	self.tokens.append(self.scan_plain())

	# Checkers.

	def check_directive(self):

	# DIRECTIVE: ^ '%' ...
	# The '%' indicator is already checked.
	if self.column == 0:
	return True

	def check_document_start(self):

	# DOCUMENT-START: ^ '---' (' '\|'\n')
	if self.column == 0:
	if self.prefix(3) == '---' \
	and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
	return True

	def check_document_end(self):

	# DOCUMENT-END: ^ '...' (' '\|'\n')
	if self.column == 0:
	if self.prefix(3) == '...' \
	and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
	return True

	def check_block_entry(self):

	# BLOCK-ENTRY: '-' (' '\|'\n')
	return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'

	def check_key(self):

	# KEY(flow context): '?'
	if self.flow_level:
	return True

	# KEY(block context): '?' (' '\|'\n')
	else:
	return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'

	def check_value(self):

	# VALUE(flow context): ':'
	if self.flow_level:
	return True

	# VALUE(block context): ':' (' '\|'\n')
	else:
	return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'

	def check_plain(self):

	# A plain scalar may start with any non-space character except:
	# '-', '?', ':', ',', '[', ']', '{', '}',
	# '#', '&', '*', '!', '\|', '>', '\'', '\"',
	# '%', '@', '`'.
	#
	# It may also start with
	# '-', '?', ':'
	# if it is followed by a non-space character.
	#
	# Note that we limit the last rule to the block context (except the
	# '-' character) because we want the flow context to be space
	# independent.
	ch = self.peek()
	return ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!\|>\'\"%@`' \
	or (self.peek(1) not in '\0 \t\r\n\x85\u2028\u2029'
	and (ch == '-' or (not self.flow_level and ch in '?:')))

	# Scanners.

	def scan_to_next_token(self):
	# We ignore spaces, line breaks and comments.
	# If we find a line break in the block context, we set the flag
	# `allow_simple_key` on.
	# The byte order mark is stripped if it's the first character in the
	# stream. We do not yet support BOM inside the stream as the
	# specification requires. Any such mark will be considered as a part
	# of the document.
	#
	# TODO: We need to make tab handling rules more sane. A good rule is
	# Tabs cannot precede tokens
	# BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
	# KEY(block), VALUE(block), BLOCK-ENTRY
	# So the checking code is
	# if <TAB>:
	# self.allow_simple_keys = False
	# We also need to add the check for `allow_simple_keys == True` to
	# `unwind_indent` before issuing BLOCK-END.
	# Scanners for block, flow, and plain scalars need to be modified.

	if self.index == 0 and self.peek() == '\uFEFF':
	self.forward()
	found = False
	while not found:
	while self.peek() == ' ':
	self.forward()
	if self.peek() == '#':
	while self.peek() not in '\0\r\n\x85\u2028\u2029':
	self.forward()
	if self.scan_line_break():
	if not self.flow_level:
	self.allow_simple_key = True
	else:
	found = True

	def scan_directive(self):
	    """Scan a '%' directive line and return a DirectiveToken.

	    The value is scanned for the 'YAML' and 'TAG' directives; for any
	    other name the value is None and the rest of the line is skipped.
	    See the specification for details.
	    """
	    start_mark = self.get_mark()
	    self.forward()
	    name = self.scan_directive_name(start_mark)
	    known = True
	    if name == 'YAML':
	        value = self.scan_yaml_directive_value(start_mark)
	    elif name == 'TAG':
	        value = self.scan_tag_directive_value(start_mark)
	    else:
	        value = None
	        known = False
	    end_mark = self.get_mark()
	    if not known:
	        # Ignore everything up to the end of the line.
	        while self.peek() not in '\0\r\n\x85\u2028\u2029':
	            self.forward()
	    self.scan_directive_ignored_line(start_mark)
	    return DirectiveToken(name, value, start_mark, end_mark)

	def scan_directive_name(self, start_mark):
	# See the specification for details.
	length = 0
	ch = self.peek(length)
	while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
	or ch in '-_':
	length += 1
	ch = self.peek(length)
	if not length:
	raise ScannerError("while scanning a directive", start_mark,
	"expected alphabetic or numeric character, but found %r"
	% ch, self.get_mark())
	value = self.prefix(length)
	self.forward(length)
	ch = self.peek()
	if ch not in '\0 \r\n\x85\u2028\u2029':
	raise ScannerError("while scanning a directive", start_mark,
	"expected alphabetic or numeric character, but found %r"
	% ch, self.get_mark())
	return value

	def scan_yaml_directive_value(self, start_mark):
	# See the specification for details.
	while self.peek() == ' ':
	self.forward()
	major = self.scan_yaml_directive_number(start_mark)
	if self.peek() != '.':
	raise ScannerError("while scanning a directive", start_mark,
	"expected a digit or '.', but found %r" % self.peek(),
	self.get_mark())
	self.forward()
	minor = self.scan_yaml_directive_number(start_mark)
	if self.peek() not in '\0 \r\n\x85\u2028\u2029':
	raise ScannerError("while scanning a directive", start_mark,
	"expected a digit or ' ', but found %r" % self.peek(),
	self.get_mark())
	return (major, minor)

	def scan_yaml_directive_number(self, start_mark):
	# See the specification for details.
	ch = self.peek()
	if not ('0' <= ch <= '9'):
	raise ScannerError("while scanning a directive", start_mark,
	"expected a digit, but found %r" % ch, self.get_mark())
	length = 0
	while '0' <= self.peek(length) <= '9':
	length += 1
	value = int(self.prefix(length))
	self.forward(length)
	return value

	def scan_tag_directive_value(self, start_mark):
	# See the specification for details.
	while self.peek() == ' ':
	self.forward()
	handle = self.scan_tag_directive_handle(start_mark)
	while self.peek() == ' ':
	self.forward()
	prefix = self.scan_tag_directive_prefix(start_mark)
	return (handle, prefix)

	def scan_tag_directive_handle(self, start_mark):
	# See the specification for details.
	value = self.scan_tag_handle('directive', start_mark)
	ch = self.peek()
	if ch != ' ':
	raise ScannerError("while scanning a directive", start_mark,
	"expected ' ', but found %r" % ch, self.get_mark())
	return value

	def scan_tag_directive_prefix(self, start_mark):
	# See the specification for details.
	value = self.scan_tag_uri('directive', start_mark)
	ch = self.peek()
	if ch not in '\0 \r\n\x85\u2028\u2029':
	raise ScannerError("while scanning a directive", start_mark,
	"expected ' ', but found %r" % ch, self.get_mark())
	return value

	def scan_directive_ignored_line(self, start_mark):
	# See the specification for details.
	while self.peek() == ' ':
	self.forward()
	if self.peek() == '#':
	while self.peek() not in '\0\r\n\x85\u2028\u2029':
	self.forward()
	ch = self.peek()
	if ch not in '\0\r\n\x85\u2028\u2029':
	raise ScannerError("while scanning a directive", start_mark,
	"expected a comment or a line break, but found %r"
	% ch, self.get_mark())
	self.scan_line_break()

	def scan_anchor(self, TokenClass):
	# The specification does not restrict characters for anchors and
	# aliases. This may lead to problems, for instance, the document:
	# [ *alias, value ]
	# can be interpreted in two ways, as
	# [ "value" ]
	# and
	# [ *alias , "value" ]
	# Therefore we restrict aliases to numbers and ASCII letters.
	start_mark = self.get_mark()
	indicator = self.peek()
	if indicator == '*':
	name = 'alias'
	else:
	name = 'anchor'
	self.forward()
	length = 0
	ch = self.peek(length)
	while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
	or ch in '-_':
	length += 1
	ch = self.peek(length)
	if not length:
	raise ScannerError("while scanning an %s" % name, start_mark,
	"expected alphabetic or numeric character, but found %r"
	% ch, self.get_mark())
	value = self.prefix(length)
	self.forward(length)
	ch = self.peek()
	if ch not in '\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
	raise ScannerError("while scanning an %s" % name, start_mark,
	"expected alphabetic or numeric character, but found %r"
	% ch, self.get_mark())
	end_mark = self.get_mark()
	return TokenClass(value, start_mark, end_mark)

	def scan_tag(self):
	    """Scan a tag and return a TagToken whose value is `(handle, suffix)`.

	    Three forms are handled: the verbatim form '!<...>', a lone '!', and
	    a handle (possibly just '!') followed by a suffix.
	    Raises ScannerError on a missing '>' or an unexpected character after
	    the tag. See the specification for details.
	    """
	    start_mark = self.get_mark()
	    ch = self.peek(1)
	    if ch == '<':
	        # Verbatim tag: no handle, the suffix runs up to '>'.
	        handle = None
	        self.forward(2)
	        suffix = self.scan_tag_uri('tag', start_mark)
	        if self.peek() != '>':
	            raise ScannerError("while parsing a tag", start_mark,
	                    "expected '>', but found %r" % self.peek(),
	                    self.get_mark())
	        self.forward()
	    elif ch in '\0 \t\r\n\x85\u2028\u2029':
	        # A lone '!'.
	        handle = None
	        suffix = '!'
	        self.forward()
	    else:
	        # Look ahead for a second '!', which closes a named handle.
	        offset = 1
	        use_handle = False
	        while ch not in '\0 \r\n\x85\u2028\u2029':
	            if ch == '!':
	                use_handle = True
	                break
	            offset += 1
	            ch = self.peek(offset)
	        if use_handle:
	            handle = self.scan_tag_handle('tag', start_mark)
	        else:
	            handle = '!'
	            self.forward()
	        suffix = self.scan_tag_uri('tag', start_mark)
	    follower = self.peek()
	    if follower not in '\0 \r\n\x85\u2028\u2029':
	        raise ScannerError("while scanning a tag", start_mark,
	                "expected ' ', but found %r" % follower, self.get_mark())
	    return TagToken((handle, suffix), start_mark, self.get_mark())

	def scan_block_scalar(self, style):
	    """Scan a literal or folded block scalar and return a ScalarToken.

	    `style` equal to '>' selects folding; any other value scans the lines
	    without folding. See the specification for details.
	    """

	    if style == '>':
	        folded = True
	    else:
	        folded = False

	    chunks = []
	    start_mark = self.get_mark()

	    # Scan the header.
	    self.forward()
	    chomping, increment = self.scan_block_scalar_indicators(start_mark)
	    self.scan_block_scalar_ignored_line(start_mark)

	    # Determine the indentation level and go to the first non-empty line.
	    min_indent = self.indent+1
	    if min_indent < 1:
	        min_indent = 1
	    if increment is None:
	        # No explicit indentation indicator: detect it from the content.
	        breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
	        indent = max(min_indent, max_indent)
	    else:
	        indent = min_indent+increment-1
	        breaks, end_mark = self.scan_block_scalar_breaks(indent)
	    line_break = ''

	    # Scan the inner part of the block scalar.
	    while self.column == indent and self.peek() != '\0':
	        chunks.extend(breaks)
	        leading_non_space = self.peek() not in ' \t'
	        # Take the rest of the line as one chunk.
	        length = 0
	        while self.peek(length) not in '\0\r\n\x85\u2028\u2029':
	            length += 1
	        chunks.append(self.prefix(length))
	        self.forward(length)
	        line_break = self.scan_line_break()
	        breaks, end_mark = self.scan_block_scalar_breaks(indent)
	        if self.column == indent and self.peek() != '\0':

	            # Unfortunately, folding rules are ambiguous.
	            #
	            # This is the folding according to the specification:

	            if folded and line_break == '\n' \
	                    and leading_non_space and self.peek() not in ' \t':
	                if not breaks:
	                    chunks.append(' ')
	            else:
	                chunks.append(line_break)

	            # This is Clark Evans's interpretation (also in the spec
	            # examples):
	            #
	            #if folded and line_break == '\n':
	            #    if not breaks:
	            #        if self.peek() not in ' \t':
	            #            chunks.append(' ')
	            #        else:
	            #            chunks.append(line_break)
	            #else:
	            #    chunks.append(line_break)
	        else:
	            break

	    # Chomp the tail. `chomping` is None (no indicator), True ('+') or
	    # False ('-'): the final line break is kept unless it is False, and
	    # the trailing breaks are kept only when it is True.
	    if chomping is not False:
	        chunks.append(line_break)
	    if chomping is True:
	        chunks.extend(breaks)

	    # We are done.
	    return ScalarToken(''.join(chunks), False, start_mark, end_mark,
	            style)

	def scan_block_scalar_indicators(self, start_mark):
	# See the specification for details.
	chomping = None
	increment = None
	ch = self.peek()
	if ch in '+-':
	if ch == '+':
	chomping = True
	else:
	chomping = False
	self.forward()
	ch = self.peek()
	if ch in '0123456789':
	increment = int(ch)
	if increment == 0:
	raise ScannerError("while scanning a block scalar", start_mark,
	"expected indentation indicator in the range 1-9, but found 0",
	self.get_mark())
	self.forward()
	elif ch in '0123456789':
	increment = int(ch)
	if increment == 0:
	raise ScannerError("while scanning a block scalar", start_mark,
	"expected indentation indicator in the range 1-9, but found 0",
	self.get_mark())
	self.forward()
	ch = self.peek()
	if ch in '+-':
	if ch == '+':
	chomping = True
	else:
	chomping = False
	self.forward()
	ch = self.peek()
	if ch not in '\0 \r\n\x85\u2028\u2029':
	raise ScannerError("while scanning a block scalar", start_mark,
	"expected chomping or indentation indicators, but found %r"
	% ch, self.get_mark())
	return chomping, increment

	def scan_block_scalar_ignored_line(self, start_mark):
	# See the specification for details.
	while self.peek() == ' ':
	self.forward()
	if self.peek() == '#':
	while self.peek() not in '\0\r\n\x85\u2028\u2029':
	self.forward()
	ch = self.peek()
	if ch not in '\0\r\n\x85\u2028\u2029':
	raise ScannerError("while scanning a block scalar", start_mark,
	"expected a comment or a line break, but found %r" % ch,
	self.get_mark())
	self.scan_line_break()

	def scan_block_scalar_indentation(self):
	# See the specification for details.
	chunks = []
	max_indent = 0
	end_mark = self.get_mark()
	while self.peek() in ' \r\n\x85\u2028\u2029':
	if self.peek() != ' ':
	chunks.append(self.scan_line_break())
	end_mark = self.get_mark()
	else:
	self.forward()
	if self.column > max_indent:
	max_indent = self.column
	return chunks, max_indent, end_mark

	def scan_block_scalar_breaks(self, indent):
	# See the specification for details.
	chunks = []
	end_mark = self.get_mark()
	while self.column < indent and self.peek() == ' ':
	self.forward()
	while self.peek() in '\r\n\x85\u2028\u2029':
	chunks.append(self.scan_line_break())
	end_mark = self.get_mark()
	while self.column < indent and self.peek() == ' ':
	self.forward()
	return chunks, end_mark

	def scan_flow_scalar(self, style):
	    """Scan a single- or double-quoted scalar and return a ScalarToken.

	    `style` equal to '"' selects double-quoted handling.
	    See the specification for details.
	    """
	    # Note that we loose indentation rules for quoted scalars. Quoted
	    # scalars don't need to adhere indentation because " and ' clearly
	    # mark the beginning and the end of them. Therefore we are less
	    # restrictive then the specification requires. We only need to check
	    # that document separators are not included in scalars.
	    double = style == '"'
	    start_mark = self.get_mark()
	    # The opening quote is also the character that closes the scalar.
	    quote = self.peek()
	    self.forward()
	    pieces = []
	    pieces.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
	    while self.peek() != quote:
	        pieces.extend(self.scan_flow_scalar_spaces(double, start_mark))
	        pieces.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
	    self.forward()
	    end_mark = self.get_mark()
	    return ScalarToken(''.join(pieces), False, start_mark, end_mark,
	            style)

	# Single-character escapes of double-quoted scalars: maps the character
	# that follows a backslash to the text it stands for.  Looked up by
	# scan_flow_scalar_non_spaces().
	ESCAPE_REPLACEMENTS = {
	'0': '\0',  # NUL
	'a': '\x07',  # bell
	'b': '\x08',  # backspace
	't': '\x09',  # horizontal tab
	'\t': '\x09',  # backslash followed by a literal tab is also a tab
	'n': '\x0A',  # line feed
	'v': '\x0B',  # vertical tab
	'f': '\x0C',  # form feed
	'r': '\x0D',  # carriage return
	'e': '\x1B',  # escape
	' ': '\x20',  # space
	'\"': '\"',  # double quote
	'\\': '\\',  # backslash
	'/': '/',  # slash
	'N': '\x85',  # next line (NEL)
	'_': '\xA0',  # non-breaking space
	'L': '\u2028',  # line separator
	'P': '\u2029',  # paragraph separator
	}

	# Numeric escapes of double-quoted scalars: maps the escape letter to the
	# number of hexadecimal digits that must follow it.  Looked up by
	# scan_flow_scalar_non_spaces().
	ESCAPE_CODES = {
	'x': 2,  # \xXX
	'u': 4,  # \uXXXX
	'U': 8,  # \UXXXXXXXX
	}

	def scan_flow_scalar_non_spaces(self, double, start_mark):
	# See the specification for details.
	chunks = []
	while True:
	length = 0
	while self.peek(length) not in '\'\"\\\0 \t\r\n\x85\u2028\u2029':
	length += 1
	if length:
	chunks.append(self.prefix(length))
	self.forward(length)
	ch = self.peek()
	if not double and ch == '\'' and self.peek(1) == '\'':
	chunks.append('\'')
	self.forward(2)
	elif (double and ch == '\'') or (not double and ch in '\"\\'):
	chunks.append(ch)
	self.forward()
	elif double and ch == '\\':
	self.forward()
	ch = self.peek()
	if ch in self.ESCAPE_REPLACEMENTS:
	chunks.append(self.ESCAPE_REPLACEMENTS[ch])
	self.forward()
	elif ch in self.ESCAPE_CODES:
	length = self.ESCAPE_CODES[ch]
	self.forward()
	for k in range(length):
	if self.peek(k) not in '0123456789ABCDEFabcdef':
	raise ScannerError("while scanning a double-quoted scalar", start_mark,
	"expected escape sequence of %d hexadecimal numbers, but found %r" %
	(length, self.peek(k)), self.get_mark())
	code = int(self.prefix(length), 16)
	chunks.append(chr(code))
	self.forward(length)
	elif ch in '\r\n\x85\u2028\u2029':
	self.scan_line_break()
	chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
	else:
	raise ScannerError("while scanning a double-quoted scalar", start_mark,
	"found unknown escape character %r" % ch, self.get_mark())
	else:
	return chunks

	def scan_flow_scalar_spaces(self, double, start_mark):
	# See the specification for details.
	chunks = []
	length = 0
	while self.peek(length) in ' \t':
	length += 1
	whitespaces = self.prefix(length)
	self.forward(length)
	ch = self.peek()
	if ch == '\0':
	raise ScannerError("while scanning a quoted scalar", start_mark,
	"found unexpected end of stream", self.get_mark())
	elif ch in '\r\n\x85\u2028\u2029':
	line_break = self.scan_line_break()
	breaks = self.scan_flow_scalar_breaks(double, start_mark)
	if line_break != '\n':
	chunks.append(line_break)
	elif not breaks:
	chunks.append(' ')
	chunks.extend(breaks)
	else:
	chunks.append(whitespaces)
	return chunks

	def scan_flow_scalar_breaks(self, double, start_mark):
	# See the specification for details.
	chunks = []
	while True:
	# Instead of checking indentation, we check for document
	# separators.
	prefix = self.prefix(3)
	if (prefix == '---' or prefix == '...') \
	and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
	raise ScannerError("while scanning a quoted scalar", start_mark,
	"found unexpected document separator", self.get_mark())
	while self.peek() in ' \t':
	self.forward()
	if self.peek() in '\r\n\x85\u2028\u2029':
	chunks.append(self.scan_line_break())
	else:
	return chunks

	def scan_plain(self):
	    """Scan a plain (unquoted) scalar and return it as a ``ScalarToken``.

	    The scalar is read word by word.  A word ends at a blank, a line
	    break, NUL, a ``':'`` that is followed by a blank/break/NUL (or, in
	    the flow context, by one of ``,[]{}``) and, in the flow context
	    only, at any of ``,?[]{}``.  Scanning stops at a ``'#'`` that starts
	    a word, when no further word can be read, when
	    scan_plain_spaces() yields nothing, or, in the block context, when
	    the column drops below ``self.indent + 1``.

	    ``self.allow_simple_key`` is cleared each time a word is consumed.
	    """
	    # See the specification for details.
	    # Indentation rules are relaxed for the flow context.
	    # Zero indentation is allowed for scalars; document separators at
	    # the beginning of a line are detected by scan_plain_spaces().
	    blank = '\0 \t\r\n\x85\u2028\u2029'
	    in_flow = bool(self.flow_level)
	    after_colon = blank + (',[]{}' if in_flow else '')
	    start_mark = self.get_mark()
	    end_mark = start_mark
	    min_column = self.indent + 1
	    pieces = []
	    pending = []
	    while self.peek() != '#':
	        # Measure the next word without consuming it.
	        run = 0
	        while True:
	            ch = self.peek(run)
	            if ch in blank:
	                break
	            if ch == ':' and self.peek(run + 1) in after_colon:
	                break
	            if in_flow and ch in ',?[]{}':
	                break
	            run += 1
	        if not run:
	            break
	        self.allow_simple_key = False
	        # Blanks seen before this word only count once a word follows.
	        pieces.extend(pending)
	        pieces.append(self.prefix(run))
	        self.forward(run)
	        end_mark = self.get_mark()
	        pending = self.scan_plain_spaces(min_column, start_mark)
	        if not pending:
	            break
	        if not in_flow and self.column < min_column:
	            break
	    return ScalarToken(''.join(pieces), True, start_mark, end_mark)

	def scan_plain_spaces(self, indent, start_mark):
	# See the specification for details.
	# The specification is really confusing about tabs in plain scalars.
	# We just forbid them completely. Do not use tabs in YAML!
	chunks = []
	length = 0
	while self.peek(length) in ' ':
	length += 1
	whitespaces = self.prefix(length)
	self.forward(length)
	ch = self.peek()
	if ch in '\r\n\x85\u2028\u2029':
	line_break = self.scan_line_break()
	self.allow_simple_key = True
	prefix = self.prefix(3)
	if (prefix == '---' or prefix == '...') \
	and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
	return
	breaks = []
	while self.peek() in ' \r\n\x85\u2028\u2029':
	if self.peek() == ' ':
	self.forward()
	else:
	breaks.append(self.scan_line_break())
	prefix = self.prefix(3)
	if (prefix == '---' or prefix == '...') \
	and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
	return
	if line_break != '\n':
	chunks.append(line_break)
	elif not breaks:
	chunks.append(' ')
	chunks.extend(breaks)
	elif whitespaces:
	chunks.append(whitespaces)
	return chunks

	def scan_tag_handle(self, name, start_mark):
	# See the specification for details.
	# For some strange reasons, the specification does not allow '_' in
	# tag handles. I have allowed it anyway.
	ch = self.peek()
	if ch != '!':
	raise ScannerError("while scanning a %s" % name, start_mark,
	"expected '!', but found %r" % ch, self.get_mark())
	length = 1
	ch = self.peek(length)
	if ch != ' ':
	while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
	or ch in '-_':
	length += 1
	ch = self.peek(length)
	if ch != '!':
	self.forward(length)
	raise ScannerError("while scanning a %s" % name, start_mark,
	"expected '!', but found %r" % ch, self.get_mark())
	length += 1
	value = self.prefix(length)
	self.forward(length)
	return value

	def scan_tag_uri(self, name, start_mark):
	# See the specification for details.
	# Note: we do not check if URI is well-formed.
	chunks = []
	length = 0
	ch = self.peek(length)
	while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
	or ch in '-;/?:@&=+$,_.!~*\'()[]%':
	if ch == '%':
	chunks.append(self.prefix(length))
	self.forward(length)
	length = 0
	chunks.append(self.scan_uri_escapes(name, start_mark))
	else:
	length += 1
	ch = self.peek(length)
	if length:
	chunks.append(self.prefix(length))
	self.forward(length)
	length = 0
	if not chunks:
	raise ScannerError("while parsing a %s" % name, start_mark,
	"expected URI, but found %r" % ch, self.get_mark())
	return ''.join(chunks)

	def scan_uri_escapes(self, name, start_mark):
	# See the specification for details.
	codes = []
	mark = self.get_mark()
	while self.peek() == '%':
	self.forward()
	for k in range(2):
	if self.peek(k) not in '0123456789ABCDEFabcdef':
	raise ScannerError("while scanning a %s" % name, start_mark,
	"expected URI escape sequence of 2 hexadecimal numbers, but found %r"
	% self.peek(k), self.get_mark())
	codes.append(int(self.prefix(2), 16))
	self.forward(2)
	try:
	value = bytes(codes).decode('utf-8')
	except UnicodeDecodeError as exc:
	raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
	return value

	def scan_line_break(self):
	# Transforms:
	# '\r\n' : '\n'
	# '\r' : '\n'
	# '\n' : '\n'
	# '\x85' : '\n'
	# '\u2028' : '\u2028'
	# '\u2029 : '\u2029'
	# default : ''
	ch = self.peek()
	if ch in '\r\n\x85':
	if self.prefix(2) == '\r\n':
	self.forward(2)
	else:
	self.forward()
	return '\n'
	elif ch in '\u2028\u2029':
	self.forward()
	return ch
	return ''