veryfansome
/

ner-explorer

Model card Files Files and versions

ner-explorer / util.py

veryfansome's picture

feat: functional CLI editor

051eb53 11 months ago

history blame contribute delete

1.76 kB

	import re

	# Chain of fixed-width negative look-behinds. A run of periods is only
	# treated as a delimiter when NONE of these fragments sits immediately
	# before it. Most entries are abbreviations that must be preceded by a
	# whitespace character (so an abbreviation at the very start of the string
	# is not covered by them).
	non_terminal_periods = (
	    r"(?<!\sApt)"
	    r"(?<!\sBlvd)"
	    r"(?<!\sCapt)"
	    r"(?<!\sDr)"
	    r"(?<!\sJr)"
	    r"(?<!\sMr)"
	    r"(?<!\sMrs)"
	    r"(?<!\sMs)"
	    r"(?<!\sPh\.D)"
	    r"(?<!\sRd)"
	    r"(?<!\sSr)"
	    r"(?<!\sSt)"
	    r"(?<!\se\.g)"
	    r"(?<!\setc)"
	    r"(?<!\si\.e)"
	    r"(?<!\slit)"
	    r"(?<!\s[A-Z])"  # single capital letter after whitespace, e.g. an initial
	    r"(?<!\(r)"  # NOTE(review): blocks "(r." - intended use case not visible here
	    r"(?<!^[a-zA-Z0-9])"  # single alphanumeric character at the start of the string
	)

	# The whole expression is one capture group, so re.split() keeps the matched
	# delimiters in its output.
	# NOTE: alternatives are joined with a plain "|". An escaped "\|" would match
	# a literal pipe character and collapse the alternation into one sequence.
	naive_sentence_end_pattern = re.compile(
	    r"("
	    r"[\n\r]+"
	    r"|[!?]+\"?(?=\s|$)"
	    r"|" + non_terminal_periods + r"\.+\"?(?=\s|$)"
	    r")"
	)
	# Option 1:
	# [\n\r]+ - Match consecutive newline and carriage returns
	# Option 2:
	# [!?]+ - Match one or more ! or ?
	# \"? - Optionally followed by a closing double quote
	# (?=\s|$) - Must be followed by \s or end-of-string
	# Option 3:
	# non_terminal_periods - Must not be preceded by non-terminal characters
	# \.+ - Match one or more .
	# \"? - Optionally followed by a closing double quote
	# (?=\s|$) - Must be followed by \s or end-of-string

	# Delimiters used by naive_tokenize(). As above, the single capture group
	# makes re.split() return the delimiters alongside the text between them.
	naive_tokenize_pattern = re.compile(
	    r"("
	    r"\s+"
	    r"|-+(?=\s|$)"
	    r"|(?<=\s)-+"
	    r"|-{2,}"
	    r"|–+"
	    r"|—+"
	    r"|(?<=[a-z])n’t(?=\s|$)"
	    r"|(?<=[a-z])n't(?=\s|$)"
	    r"|’[a-s,u-z]+(?=\s|$)"
	    r"|'[a-s,u-z]+(?=\s|$)"
	    r"|’+"
	    r"|'+"
	    r"|\"+"
	    r"|`+"
	    r"|,+(?=\"|\s|$)"
	    r"|" + non_terminal_periods + r"\.+(?=\"|\s|$)"
	    r"|:+"
	    r"|;+"
	    r"|[?!]+(?=\"|\s|$)"
	    r"|\(+"
	    r"|\)+"
	    r"|\[+"
	    r"|]+"
	    r"|\{+"
	    r"|}+"
	    r"|<+"
	    r"|>+"
	    r"|[\u4e00-\u9fff]"  # For Chinese characters, which are not space delimited
	    r")"
	)


	def naive_tokenize(text: str) -> list[str]:
	    """Split *text* into tokens using ``naive_tokenize_pattern``.

	    The text is split on the pattern's delimiters, and the delimiters
	    themselves are returned as tokens. Empty pieces and pieces that begin
	    with a space or a tab are discarded; whitespace runs that begin with
	    any other whitespace character (for example a newline) are kept.

	    Args:
	        text: The string to tokenize.

	    Returns:
	        The remaining pieces, in their original order.
	    """
	    pieces = naive_tokenize_pattern.split(text)
	    # startswith() accepts a tuple, so one call covers both prefixes.
	    return [piece for piece in pieces
	            if piece != "" and not piece.startswith((" ", "\t"))]