Spaces:

ProjectFrozone
/

Site

Running

App Files Files Community

Site / src /text_corruption /corruptor.py

GalaxyTab

Added Frozone Stuff

40a04d4 24 days ago

raw

history blame contribute delete

3.56 kB

	#!/usr/bin/env python3
	"""
	corrupter.py - light English typo/misspelling injection for to simulate
	"human-typed" text.

	This module provides `corrupt(text, ...)`, which returns the original text with
	a small amount of realistic noise (common misspellings and keyboard typos). See
	function docstring for arguments and explanations.
	"""
	import re
	import argparse
	import random

	import nlpaug.augmenter.word as naw
	import nlpaug.augmenter.char as nac
	from nlpaug.flow import Sequential


	def parse_args():
	parser = argparse.ArgumentParser(
	description="Interactive text corrupter.")
	parser.add_argument(
	"--misspelling-prob",
	type=float,
	default=0.04,
	help="Fraction of words to misspell (roughly)"
	)
	parser.add_argument(
	"--typo-prob",
	type=float,
	default=0.01,
	help="Fraction of words to finger fart"
	)
	parser.add_argument(
	"--min-len",
	type=int,
	default=3,
	help="Minimum length word to possibly corrupt"
	)
	parser.add_argument(
	"--verbose",
	action="store_true",
	help="If True, print words and corruptions (if any)."
	)
	return parser.parse_args()


	def corrupt(
	text: str,
	misspell_prob: float = 0.04,
	typo_prob: float = 0.01,
	min_len: int = 3,
	misspell_aug_p: float = 0.02,
	typo_aug_p: float = 0.001,
	verbose: bool = False
	) -> str:
	"""
	Modestly corrupt the text passed to get a version with a small number of
	misspellings and typos.

	misspell_prob: Probability of each word (of sufficient length) being
	misspelled.
	typo_prob: Probability of each word (of sufficient length) having a typo.
	min_len: The minimum length word that will be considered for corruption.
	misspell_aug_p: Passed through directly to the nlpaug.augmenter. From their
	docs, it seems to mean "for words chosen for misspelling, what
	percentage of the word is misspelled?"
	typo_aug_p: Passed through directly to the nlpaug.augmenter. From their
	docs, it seems to mean "for words chosen for typos, what percentage of
	the word will have typos?"
	verbose: if True, print words as encountered, plus their corruptions (if
	any).
	"""

	TOKEN_RE = re.compile(
	r"[A-Za-z0-9]+(?:'[A-Za-z0-9]+)?\|\s+\|[^\w\s]",
	re.UNICODE
	)
	WORD_RE = re.compile(r"[A-Za-z]+(?:'[A-Za-z0-9]+)?", re.UNICODE)

	misspell_aug = naw.SpellingAug(aug_p=misspell_aug_p)
	typo_aug = nac.KeyboardAug(aug_word_p=typo_aug_p)

	tokens = TOKEN_RE.findall(text)
	for i in range(len(tokens)):
	if verbose: print(f"Considering {tokens[i]}...")
	if WORD_RE.fullmatch(tokens[i]):
	if len(tokens[i]) >= min_len and random.random() < misspell_prob:
	tokens[i] = misspell_aug.augment(tokens[i])[0]
	if verbose: print(f" ...misspelled to {tokens[i]}")
	if len(tokens[i]) >= min_len and random.random() < typo_prob:
	tokens[i] = typo_aug.augment(tokens[i])[0]
	if verbose: print(f" ...corrupted to {tokens[i]}")

	return "".join(tokens)


	if __name__ == "__main__":

	random.seed(123)

	args = parse_args()

	s = input("Enter text: ")
	while s != "done":
	corrupted = corrupt(
	s,
	args.misspelling_prob,
	args.typo_prob,
	args.min_len,
	verbose=args.verbose,
	)
	print(f"Corrupted version: {corrupted}")
	s = input("Enter text: ")