parthraninga's picture
Upload folder using huggingface_hub
f39d337 verified
"""
Parser for parsing a regular expression.
Take a string representing a regular expression and return the root node of its
parse tree.
usage::
root_node = parse_regex(tokenize_regex('(hello|world)'))
Remarks:
- The regex parser processes multiline, it ignores all whitespace and supports
multiple named groups with the same name and #-style comments.
Limitations:
- Lookahead is not supported.
"""
from __future__ import annotations
import re
__all__ = [
"Repeat",
"Variable",
"Regex",
"Lookahead",
"tokenize_regex",
"parse_regex",
]
class Node:
    """
    Abstract base class for all grammar nodes.

    Never instantiated as a useful node by itself; it only supplies the
    operators that combine grammars into bigger ones.
    """

    def __add__(self, other_node: Node) -> NodeSequence:
        # "a + b" concatenates two grammars.
        children = [self, other_node]
        return NodeSequence(children)

    def __or__(self, other_node: Node) -> AnyNode:
        # "a | b" creates a union (OR) of two grammars.
        children = [self, other_node]
        return AnyNode(children)
class AnyNode(Node):
    """
    Union (OR operation) of several grammars.

    Produced by the ``grammar_1 | grammar_2`` operator; not meant to be
    instantiated directly by users.
    """

    def __init__(self, children: list[Node]) -> None:
        self.children = children

    def __or__(self, other_node: Node) -> AnyNode:
        # Extend this union in place-style rather than nesting a new
        # AnyNode inside another one.
        return AnyNode([*self.children, other_node])

    def __repr__(self) -> str:
        return "{}({!r})".format(self.__class__.__name__, self.children)
class NodeSequence(Node):
    """
    Concatenation of several grammars, one after the other.

    Produced by the ``grammar_1 + grammar_2`` operator; not meant to be
    instantiated directly by users.
    """

    def __init__(self, children: list[Node]) -> None:
        self.children = children

    def __add__(self, other_node: Node) -> NodeSequence:
        # Append to the existing sequence rather than nesting sequences.
        return NodeSequence([*self.children, other_node])

    def __repr__(self) -> str:
        return "{}({!r})".format(self.__class__.__name__, self.children)
class Regex(Node):
    """
    Leaf node holding a plain regular expression snippet.

    :param regex: Regex text. It is compiled once at construction time,
        purely as validation; the compiled object is discarded.
    :raises re.error: When ``regex`` is not a valid pattern.
    """

    def __init__(self, regex: str) -> None:
        # Fail early on an invalid pattern.
        re.compile(regex)
        self.regex = regex

    def __repr__(self) -> str:
        return "{}(/{}/)".format(self.__class__.__name__, self.regex)
class Lookahead(Node):
    """
    (Possibly negative) lookahead expression node.

    :param childnode: Grammar that must (or, when negative, must not)
        follow at this position.
    :param negative: When True, this represents a negative lookahead.
    """

    def __init__(self, childnode: Node, negative: bool = False) -> None:
        self.childnode = childnode
        self.negative = negative

    def __repr__(self) -> str:
        return "{}({!r})".format(self.__class__.__name__, self.childnode)
class Variable(Node):
    """
    Mark a variable in the regular grammar; this will be translated into
    a named group. Each variable can have its own completer, validator,
    etc.

    :param childnode: The grammar which is wrapped inside this variable.
    :param varname: String name of the variable.
    """

    def __init__(self, childnode: Node, varname: str = "") -> None:
        self.childnode = childnode
        self.varname = varname

    def __repr__(self) -> str:
        return "{}(childnode={!r}, varname={!r})".format(
            self.__class__.__name__, self.childnode, self.varname
        )
class Repeat(Node):
    """
    Repetition of a grammar (the tree form of ``*``, ``+`` and ``?``).

    :param childnode: The grammar being repeated.
    :param min_repeat: Minimum number of repetitions.
    :param max_repeat: Maximum number of repetitions; None means
        unbounded.
    :param greedy: Whether the repetition is greedy.
    """

    def __init__(
        self,
        childnode: Node,
        min_repeat: int = 0,
        max_repeat: int | None = None,
        greedy: bool = True,
    ) -> None:
        self.childnode = childnode
        self.min_repeat = min_repeat
        self.max_repeat = max_repeat
        self.greedy = greedy

    def __repr__(self) -> str:
        return "{}(childnode={!r})".format(self.__class__.__name__, self.childnode)
def tokenize_regex(input: str) -> list[str]:
    """
    Takes a string, representing a regular expression as input, and tokenizes
    it.

    Whitespace-only tokens are discarded, which is what allows the grammar to
    be written in a multiline, verbose style.

    :param input: string, representing a regular expression.
    :returns: List of tokens.
    :raises Exception: When the input cannot be tokenized.
    """
    # Regular expression for tokenizing other regular expressions.
    #
    # Fixes compared to the previous pattern:
    # - The "#" in the "(?#...)" alternative is escaped; unescaped, re.VERBOSE
    #   treated the remainder of that line (including its "|") as a comment,
    #   which fused it with the "(?=" alternative and broke both.
    # - "(?:", "(?P=name)" and the flag tokens now require a literal "(";
    #   previously "\(?" made the parenthesis optional, so e.g. "(?:" was
    #   never produced as a single token.
    # - The "(?" alternatives are all tried before the bare "(" so they are
    #   actually reachable.
    # - "??" no longer requires a trailing space ("\?\?\ " escaped a space).
    # - A "#"-comment on the final line is recognized even without a
    #   trailing newline.
    p = re.compile(
        r"""^(
        \(\?P\<[a-zA-Z0-9_-]+\> | # Start of named group.
        \(\?\#[^)]*\)           | # Comment
        \(\?=                   | # Start of lookahead assertion
        \(\?!                   | # Start of negative lookahead assertion
        \(\?<=                  | # If preceded by.
        \(\?<                   | # If not preceded by.
        \(\?:                   | # Start of group. (non capturing.)
        \(\?[iLmsux]            | # Flags.
        \(\?P=[a-zA-Z]+\)       | # Back reference to named group
        \(                      | # Start of group.
        \)                      | # End of group.
        \{[^{}]*\}              | # Repetition
        \*\? | \+\? | \?\?      | # Non greedy repetition.
        \* | \+ | \?            | # Repetition
        \#[^\n]*                | # Comment
        \\.                     | # Escaped character.

        # Character group.
        \[
        ( [^\]\\] | \\.)*
        \]                      |

        [^(){}]                 |
        .
        )""",
        re.VERBOSE,
    )

    tokens: list[str] = []

    while input:
        m = p.match(input)
        if m is None:
            raise Exception("Could not tokenize input regex.")

        token, input = input[: m.end()], input[m.end() :]
        # Drop pure-whitespace tokens: this gives the verbose/multiline style.
        if not token.isspace():
            tokens.append(token)

    return tokens
def parse_regex(regex_tokens: list[str]) -> Node:
    """
    Takes a list of tokens from the tokenizer, and returns a parse tree.

    :param regex_tokens: List of token strings, as returned by
        `tokenize_regex`.
    :returns: Root `Node` of the parse tree.
    :raises Exception: On unbalanced parentheses, a repetition operator with
        nothing to repeat, or an unsupported construct.
    """
    # We add a closing brace because that represents the final pop of the stack.
    tokens: list[str] = [")"] + regex_tokens[::-1]

    def wrap(lst: list[Node]) -> Node:
        """Turn list into sequence when it contains several items."""
        if len(lst) == 1:
            return lst[0]
        else:
            return NodeSequence(lst)

    def _parse() -> Node:
        or_list: list[list[Node]] = []
        result: list[Node] = []

        def wrapped_result() -> Node:
            if or_list == []:
                return wrap(result)
            else:
                or_list.append(result)
                return AnyNode([wrap(i) for i in or_list])

        while tokens:
            t = tokens.pop()

            if t.startswith("(?P<"):
                # Named group: parse the group body recursively and wrap it.
                variable = Variable(_parse(), varname=t[4:-1])
                result.append(variable)

            elif t in ("*", "*?"):
                # Bug fix: like the "?" branch below, guard against a
                # repetition operator with nothing before it; previously a
                # bare IndexError escaped from result[-1].
                if not result:
                    raise Exception("Nothing to repeat." + repr(tokens))
                greedy = t == "*"
                result[-1] = Repeat(result[-1], greedy=greedy)

            elif t in ("+", "+?"):
                if not result:
                    raise Exception("Nothing to repeat." + repr(tokens))
                greedy = t == "+"
                result[-1] = Repeat(result[-1], min_repeat=1, greedy=greedy)

            elif t in ("?", "??"):
                if not result:
                    raise Exception("Nothing to repeat." + repr(tokens))
                greedy = t == "?"
                result[-1] = Repeat(
                    result[-1], min_repeat=0, max_repeat=1, greedy=greedy
                )

            elif t == "|":
                # End of one OR-alternative; start collecting the next one.
                or_list.append(result)
                result = []

            elif t in ("(", "(?:"):
                result.append(_parse())

            elif t == "(?!":
                result.append(Lookahead(_parse(), negative=True))

            elif t == "(?=":
                result.append(Lookahead(_parse(), negative=False))

            elif t == ")":
                return wrapped_result()

            elif t.startswith("#"):
                # Comments are ignored.
                pass

            elif t.startswith("{"):
                # TODO: implement!
                raise Exception(f"{t}-style repetition not yet supported")

            elif t.startswith("(?"):
                raise Exception(f"{t!r} not supported")

            elif t.isspace():
                pass
            else:
                result.append(Regex(t))

        raise Exception("Expecting ')' token")

    result = _parse()

    if len(tokens) != 0:
        raise Exception("Unmatched parentheses.")
    else:
        return result