rewrite / src /preprocessing /dyslexia_simulator.py

Add files using upload-large-folder tool

12fd5f2 verified 10 days ago

5.11 kB

	"""
	Programmatically generates dyslectic training data from clean text.
	Used to augment training pairs when real dyslectic examples are scarce.

	Error types simulated (from Rello et al. 2013, 2017 dyslexia research):
	- Phonetic substitution (most common, ~35% of errors)
	- Letter transposition (e.g., "teh" for "the") (~18%)
	- Letter omission (~16%)
	- Letter doubling (~12%)
	- Letter reversal b/d, p/q (~10%)
	- Word boundary errors (~9%)
	"""

	import random
	import re
	from typing import Tuple


	class DyslexiaSimulator:
	    """Generates synthetic dyslectic text from clean input for data augmentation.

	    Errors are injected at two levels:

	    * phrase level -- the two-word phrases in ``WORD_MERGES`` are fused
	      ("a lot" -> "alot");
	    * word level -- phonetic substitution, adjacent-letter transposition,
	      letter omission, letter doubling and mirror-letter reversal.

	    Every random decision is drawn from a private ``random.Random`` created
	    in ``__init__``, so the same seed and the same sequence of calls
	    produce the same output.
	    """

	    # Mirror-image letter pairs; lookups use the lowercased letter.
	    LETTER_REVERSALS = {'b': 'd', 'd': 'b', 'p': 'q', 'q': 'p', 'n': 'u', 'u': 'n'}
	    # Misspellings keyed by the lowercased correct spelling. The multi-word
	    # keys only match when corrupt_word() is called with the whole phrase;
	    # simulate() passes single tokens and merges phrases via WORD_MERGES.
	    PHONETIC_SUBS = {
	        'was': 'wuz', 'could': 'cud', 'would': 'wud', 'they': 'thay',
	        'because': 'becaus', 'important': 'importnt', 'receive': 'recieve',
	        'believe': 'beleive', 'definitely': 'definately', 'separate': 'seperate',
	        'a lot': 'alot', 'in fact': 'infact', 'as well': 'aswell',
	    }
	    # (phrase, fused form) pairs applied by simulate() before word errors.
	    WORD_MERGES = [
	        ('a lot', 'alot'), ('in fact', 'infact'), ('as well', 'aswell'),
	        ('all right', 'alright'), ('every one', 'everyone'),
	    ]
	    # Chance of using the PHONETIC_SUBS entry when a word has one.
	    PHONETIC_SUB_PROBABILITY = 0.35
	    # Letter-level error types with their relative sampling weights.
	    ERROR_WEIGHTS = (
	        ('transpose', 0.35),
	        ('omit', 0.30),
	        ('double', 0.20),
	        ('reverse', 0.15),
	    )
	    # Punctuation peeled off both ends of a token before it is corrupted.
	    _EDGE_PUNCTUATION = ".,!?;:\"'()[]{}—–-"

	    def __init__(self, error_rate: float = 0.15, seed: int = 42):
	        """Create a simulator.

	        Args:
	            error_rate: Probability that an eligible word is corrupted and
	                that a mergeable phrase is fused. Values <= 0 disable all
	                errors; values >= 1 corrupt every eligible word.
	            seed: Seed for the simulator's private random generator.
	        """
	        self.error_rate = error_rate
	        self.rng = random.Random(seed)

	    @staticmethod
	    def _match_case(template: str, text: str) -> str:
	        """Return ``text`` re-cased like ``template`` (upper, capitalised or as is)."""
	        if template.isupper():
	            return text.upper()
	        if template[:1].isupper():
	            return text.capitalize()
	        return text

	    def _transpose_letters(self, word: str) -> str:
	        """Swap two adjacent letters.

	        The first letter is never moved, but the last one can be (this is
	        what turns "the" into "teh"). Words shorter than 3 characters are
	        returned unchanged.
	        """
	        if len(word) < 3:
	            return word
	        # Left index of the swapped pair: 1 .. len-2, so idx + 1 stays in range.
	        idx = self.rng.randint(1, len(word) - 2)
	        chars = list(word)
	        chars[idx], chars[idx + 1] = chars[idx + 1], chars[idx]
	        return ''.join(chars)

	    def _omit_letter(self, word: str) -> str:
	        """Remove a random interior letter.

	        Words shorter than 4 characters are returned unchanged.
	        """
	        if len(word) < 4:
	            return word
	        idx = self.rng.randint(1, len(word) - 2)
	        return word[:idx] + word[idx + 1:]

	    def _double_letter(self, word: str) -> str:
	        """Double a random interior letter.

	        Words shorter than 3 characters are returned unchanged.
	        """
	        if len(word) < 3:
	            return word
	        idx = self.rng.randint(1, len(word) - 2)
	        return word[:idx] + word[idx] + word[idx:]

	    def _reverse_letter(self, word: str) -> str:
	        """Mirror the first letter of ``word`` found in ``LETTER_REVERSALS``.

	        Only one letter is changed and its case is preserved. The word is
	        returned unchanged when it has no reversible letter.
	        """
	        for index, letter in enumerate(word):
	            mirror = self.LETTER_REVERSALS.get(letter.lower())
	            if mirror is None:
	                continue
	            if letter.isupper():
	                mirror = mirror.upper()
	            return word[:index] + mirror + word[index + 1:]
	        return word

	    def corrupt_word(self, word: str) -> str:
	        """Apply a single random error to a word.

	        A word listed in ``PHONETIC_SUBS`` is replaced by its misspelling
	        with probability ``PHONETIC_SUB_PROBABILITY``. Otherwise one
	        letter-level error is sampled from ``ERROR_WEIGHTS``. If the sampled
	        error leaves the word untouched (for example a reversal on a word
	        with no reversible letter), another error type is sampled from the
	        remaining ones so the word really does change.

	        Args:
	            word: The word to corrupt, without surrounding punctuation.

	        Returns:
	            The corrupted word, or ``word`` itself when it is shorter than
	            3 characters.
	        """
	        if len(word) < 3:
	            return word

	        misspelling = self.PHONETIC_SUBS.get(word.lower())
	        if (misspelling is not None
	                and self.rng.random() < self.PHONETIC_SUB_PROBABILITY):
	            return self._match_case(word, misspelling)

	        handlers = {
	            'transpose': self._transpose_letters,
	            'omit': self._omit_letter,
	            'double': self._double_letter,
	            'reverse': self._reverse_letter,
	        }
	        remaining = list(self.ERROR_WEIGHTS)
	        while remaining:
	            names = [name for name, _ in remaining]
	            weights = [weight for _, weight in remaining]
	            chosen = self.rng.choices(names, weights=weights, k=1)[0]
	            candidate = handlers[chosen](word)
	            if candidate != word:
	                return candidate
	            # The sampled error was a no-op for this word; drop it and resample.
	            remaining = [pair for pair in remaining if pair[0] != chosen]
	        return word

	    def _merge_phrases(self, text: str) -> str:
	        """Fuse the first whole-word occurrence of each ``WORD_MERGES`` phrase.

	        Each phrase present in ``text`` is merged with probability
	        ``error_rate``; the fused form keeps the capitalisation of the match.
	        """
	        for phrase, fused in self.WORD_MERGES:
	            # Lookarounds keep the phrase from matching inside longer words
	            # ("banana lot" must not become "bananalot").
	            pattern = re.compile(
	                r'(?<!\w)' + re.escape(phrase) + r'(?!\w)', re.IGNORECASE
	            )
	            found = pattern.search(text)
	            if found is None or self.rng.random() >= self.error_rate:
	                continue
	            replacement = self._match_case(found.group(0), fused)
	            text = text[:found.start()] + replacement + text[found.end():]
	        return text

	    def _corrupt_token(self, token: str) -> str:
	        """Corrupt one whitespace-free token with probability ``error_rate``.

	        Leading and trailing punctuation is set aside and reattached, so
	        quoted or parenthesised words are eligible too. Tokens whose core is
	        shorter than 3 characters or not purely alphabetic are returned
	        unchanged.
	        """
	        without_lead = token.lstrip(self._EDGE_PUNCTUATION)
	        prefix = token[:len(token) - len(without_lead)]
	        core = without_lead.rstrip(self._EDGE_PUNCTUATION)
	        suffix = without_lead[len(core):]

	        # The random draw deliberately happens before the isalpha() check so
	        # that non-alphabetic tokens still consume one draw each.
	        if (len(core) >= 3
	                and self.rng.random() < self.error_rate
	                and core.isalpha()):
	            return prefix + self.corrupt_word(core) + suffix
	        return token

	    def simulate(self, clean_text: str) -> Tuple[str, str]:
	        """Returns (corrupted_text, clean_text) training pair.

	        Phrase merges are applied first, then each word is corrupted
	        independently with probability ``error_rate``. Whitespace between
	        words (repeated spaces, tabs, newlines) is copied through verbatim.

	        Args:
	            clean_text: The correctly spelled source text.

	        Returns:
	            A tuple ``(corrupted_text, clean_text)``. Empty or
	            whitespace-only input is returned as ``(clean_text, clean_text)``.
	        """
	        if not clean_text or not clean_text.strip():
	            return (clean_text, clean_text)

	        merged_text = self._merge_phrases(clean_text)

	        # The capturing group makes re.split keep the whitespace runs, so the
	        # corrupted text differs from the clean text only by injected errors.
	        pieces = re.split(r'(\s+)', merged_text)
	        corrupted = ''.join(
	            piece if (not piece or piece.isspace()) else self._corrupt_token(piece)
	            for piece in pieces
	        )
	        return (corrupted, clean_text)