# Humanizer-try / enhanced_text_humanizer.py
# Author: VarunRavichander — commit 6fd1367 ("Update enhanced_text_humanizer.py")
import nltk
import random
import re
import spacy
import numpy as np
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
from collections import defaultdict
from transformers import pipeline
# Ensure the NLTK data packages used below are present: punkt/punkt_tab back
# sent_tokenize/word_tokenize, the tagger backs pos_tag, wordnet backs the
# synonym lookup.  (A redundant module-level spacy.load() whose result was
# discarded has been removed; the model is loaded once in __init__.)
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('taggers/averaged_perceptron_tagger')
    nltk.data.find('corpora/wordnet')
except LookupError as e:
    print(f"NLTK resource error: {e}")
    print("Attempting to download missing resources...")
    nltk.download('punkt')
    nltk.download('punkt_tab')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('wordnet')
class EnhancedTextHumanizer:
    """Rule-based text "humanizer".

    Applies randomized word-, sentence-, and paragraph-level rewrites
    (synonym swaps, contractions, fillers, discourse markers, dialect
    substitutions, simulated speech errors) to make machine-sounding text
    read more naturally.
    """

    def __init__(self):
        """Load NLP models and build the static lookup tables the transformations use."""
        # spaCy pipeline used for tagging, dependency parsing, and similarity.
        self.nlp = spacy.load("en_core_web_sm")
        # Optional transformers sentiment pipeline; tone detection degrades
        # gracefully to 'neutral' when the model cannot be loaded.
        try:
            self.sentiment_analyzer = pipeline("sentiment-analysis")
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are not swallowed during startup.
            print("Warning: Transformers sentiment analysis not available. Emotional variations will be limited.")
            self.sentiment_analyzer = None
        # Sentence-level transformations (each takes (sent1, sent2) — the
        # second argument is "" for single-sentence transforms).
        self.sentence_transformations = [
            self.merge_sentences,
            self.split_sentences,
            self.passive_to_active,
            self.active_to_passive,
            self.add_hedging,
            self.add_intensifiers,
            self.add_rhetorical_question,
            self.add_aside_comment
        ]
        # Word-level transformations (applied per spaCy token).
        self.word_transformations = [
            self.contextual_synonym_replacement,
            self.contraction_expansion,
            self.add_filler_words,
            self.informal_substitution,
            self.add_emphatic_repetition
        ]
        # Paragraph-level transformations (each takes (paragraph, profile)).
        self.paragraph_transformations = [
            self.add_discourse_markers,
            self.adjust_formality,
            self.add_cohesion_devices
        ]
        # Filler words grouped by conversational function.
        self.filler_words = {
            'hesitation': ["um", "uh", "er", "hmm", "like"],
            'emphasis': ["actually", "literally", "seriously", "honestly", "truly", "really"],
            'hedging': ["maybe", "perhaps", "probably", "possibly", "somewhat", "kinda", "sort of"],
            'clarification': ["I mean", "you know", "what I'm saying", "in other words"],
            'informal': ["basically", "totally", "absolutely", "pretty much", "y'know"]
        }
        # Contractions and their expansions.
        self.contractions = {
            "can't": "cannot", "won't": "will not", "don't": "do not",
            "doesn't": "does not", "I'm": "I am", "you're": "you are",
            "they're": "they are", "we're": "we are", "it's": "it is",
            "that's": "that is", "who's": "who is", "what's": "what is",
            "there's": "there is", "here's": "here is", "he's": "he is",
            "she's": "she is", "I've": "I have", "you've": "you have",
            "we've": "we have", "they've": "they have", "I'll": "I will",
            "you'll": "you will", "he'll": "he will", "she'll": "she will",
            "we'll": "we will", "they'll": "they will", "I'd": "I would",
            "you'd": "you would", "he'd": "he would", "she'd": "she would",
            "we'd": "we would", "they'd": "they would", "let's": "let us",
            "ain't": "am not", "wasn't": "was not", "weren't": "were not",
            "hasn't": "has not", "haven't": "have not", "couldn't": "could not",
            "shouldn't": "should not", "wouldn't": "would not", "didn't": "did not",
            "isn't": "is not", "aren't": "are not", "mightn't": "might not",
            "mustn't": "must not", "shan't": "shall not", "needn't": "need not"
        }
        # Reverse mapping: expansion -> contraction.
        self.expansions = {v: k for k, v in self.contractions.items()}
        # Common hedging phrases, prepended (or appended to questions).
        self.hedging_phrases = [
            "I think", "It seems", "It appears", "From what I understand",
            "As far as I know", "In my opinion", "Arguably", "Presumably",
            "It could be that", "It's possible that", "One might say",
            "To some extent", "More or less", "Kind of", "Sort of"
        ]
        # Intensifiers placed before adjectives/adverbs.
        self.intensifiers = [
            "very", "extremely", "incredibly", "remarkably", "absolutely",
            "totally", "completely", "utterly", "entirely", "thoroughly",
            "ridiculously", "insanely", "super", "really", "quite",
            "unbelievably", "amazingly", "surprisingly", "exceptionally"
        ]
        # Discourse markers for sentence transitions, grouped by function.
        self.discourse_markers = {
            'contrast': ["however", "but", "nevertheless", "on the other hand", "conversely", "in contrast", "yet"],
            'addition': ["moreover", "furthermore", "additionally", "also", "besides", "in addition", "plus"],
            'cause_effect': ["therefore", "consequently", "thus", "hence", "as a result", "so", "because of this"],
            'sequence': ["first", "second", "next", "then", "finally", "subsequently", "later"],
            'example': ["for example", "for instance", "specifically", "to illustrate", "such as", "namely"],
            'conclusion': ["in conclusion", "to sum up", "in summary", "overall", "ultimately", "in the end"],
            'emphasis': ["indeed", "certainly", "in fact", "obviously", "clearly", "notably", "significantly"]
        }
        # Formal word -> informal alternatives.
        self.informal_words = {
            "approximately": ["about", "around"],
            "assistance": ["help", "a hand"],
            "attempt": ["try", "shot", "stab"],
            "communicate": ["talk", "chat", "get in touch"],
            "comprehend": ["get", "understand"],
            "concerning": ["about", "on"],
            "consume": ["eat", "drink", "use up"],
            "currently": ["now", "right now"],
            "decrease": ["drop", "cut", "fall"],
            "difficult": ["hard", "tough"],
            "encounter": ["meet", "run into", "bump into"],
            "endeavor": ["try", "take a shot"],
            "excessive": ["too much", "over the top"],
            "expedite": ["speed up", "hurry"],
            "facilitate": ["help", "make easier"],
            "frequently": ["often", "a lot"],
            "fundamental": ["basic", "key"],
            "utilize": ["use"],
            "purchase": ["buy", "get"],
            "sufficient": ["enough"],
            "inquire": ["ask"],
            "obtain": ["get"],
            "require": ["need"],
            "additional": ["more", "extra"],
            "residence": ["home", "place"]
        }
        # Speech error generators keyed by error type (see _introduce_speech_errors).
        self.speech_errors = {
            'restarts': self._generate_restart,
            'repetitions': self._generate_repetition,
            'corrections': self._generate_correction,
            'filled_pauses': self._generate_filled_pause,
            'agreement_errors': self._generate_agreement_error
        }
        # Regional dialect variations (simplified word/phrase swaps).
        self.regional_variations = {
            'us_south': {
                'you all': "y'all",
                'going to': "gonna",
                'want to': "wanna",
                'did not': "didn't",
                'yes': "yep",
                'no': "nope"
            },
            'british': {
                'apartment': "flat",
                'elevator': "lift",
                'trash': "rubbish",
                'sidewalk': "pavement",
                'vacation': "holiday",
                'soccer': "football"
            }
        }
        # Personality profiles: per-transformation probability knobs.
        self.personality_profiles = {
            'casual': {
                'contraction_rate': 0.8,
                'informal_rate': 0.7,
                'hedging_rate': 0.3,
                'filler_rate': 0.4,
                'error_rate': 0.1
            },
            'formal': {
                'contraction_rate': 0.2,
                'informal_rate': 0.1,
                'hedging_rate': 0.5,
                'filler_rate': 0.1,
                'error_rate': 0.05
            },
            'academic': {
                'contraction_rate': 0.1,
                'informal_rate': 0.05,
                'hedging_rate': 0.6,
                'filler_rate': 0.1,
                'error_rate': 0.02
            },
            'enthusiastic': {
                'contraction_rate': 0.6,
                'informal_rate': 0.5,
                'hedging_rate': 0.2,
                'filler_rate': 0.3,
                'error_rate': 0.1,
                'intensifier_rate': 0.7
            }
        }
        # Emotional expression templates; {topic} and {sentence} are filled
        # in humanize_text.
        self.emotional_expressions = {
            'positive': [
                "I'm so happy about {topic}!",
                "This is amazing: {sentence}",
                "I love how {sentence}",
                "Wow, {sentence} That's incredible!",
                "I'm really excited about {topic}."
            ],
            'negative': [
                "I'm not too thrilled about {topic}.",
                "Unfortunately, {sentence}",
                "I'm concerned that {sentence}",
                "This is disappointing: {sentence}",
                "I'm a bit worried about {topic}."
            ],
            'neutral': [
                "In my view, {sentence}",
                "I think {sentence}",
                "From what I understand, {sentence}",
                "My take on {topic} is that {sentence}",
                "When it comes to {topic}, {sentence}"
            ]
        }
def humanize_text(self, text, intensity=0.5, personality='casual', add_errors=True, regional_dialect=None, emotional_tone=None):
"""
Enhanced main function to humanize text with multiple parameters for customization.
Args:
text (str): The input text to humanize
intensity (float): Controls how much the text is transformed (0.0 to 1.0)
personality (str): Personality profile to use ('casual', 'formal', 'academic', 'enthusiastic')
add_errors (bool): Whether to add realistic speech/typing errors
regional_dialect (str): Regional dialect to incorporate (None, 'us_south', 'british')
emotional_tone (str): Overall emotional tone (None, 'positive', 'negative', 'neutral')
Returns:
str: Humanized text
"""
if intensity < 0 or intensity > 1:
raise ValueError("Intensity must be between 0.0 and 1.0")
# Apply personality profile
profile = self.personality_profiles.get(personality, self.personality_profiles['casual'])
# Parse the text with spaCy for better linguistic analysis
doc = self.nlp(text)
# Split text into paragraphs
paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
# Apply paragraph-level transformations
transformed_paragraphs = []
for para in paragraphs:
# Analyze sentiment if available
sentiment = self._analyze_sentiment(para) if emotional_tone is None and self.sentiment_analyzer else None
current_tone = emotional_tone or (sentiment['label'].lower() if sentiment else 'neutral')
# Add emotional expressions based on tone
if random.random() < intensity * 0.3 and current_tone in self.emotional_expressions:
# Find a topic in the paragraph
topic = self._extract_topic(para)
expression = random.choice(self.emotional_expressions[current_tone])
para = expression.format(topic=topic, sentence=para.lower() if para[0].isupper() else para)
# Apply paragraph transformations
for transform in self.paragraph_transformations:
if random.random() < intensity * 0.4:
para = transform(para, profile)
# Split paragraph into sentences
sentences = sent_tokenize(para)
# Apply sentence-level transformations
transformed_sentences = self._apply_sentence_transformations(sentences, intensity, profile)
# Apply regional dialect if specified
if regional_dialect and regional_dialect in self.regional_variations:
transformed_para = " ".join(transformed_sentences)
for original, variant in self.regional_variations[regional_dialect].items():
# Use word boundaries to avoid partial replacements
pattern = r'\b' + re.escape(original) + r'\b'
if random.random() < intensity * 0.7:
transformed_para = re.sub(pattern, variant, transformed_para, flags=re.IGNORECASE)
transformed_paragraphs.append(transformed_para)
else:
transformed_paragraphs.append(" ".join(transformed_sentences))
# Join paragraphs
result = "\n\n".join(transformed_paragraphs)
# Apply word-level transformations
result = self._apply_word_transformations(result, intensity, profile)
# Introduce speech errors if enabled
if add_errors and intensity > 0.2:
result = self._introduce_speech_errors(result, intensity * profile.get('error_rate', 0.1))
# Normalize spacing around punctuation
result = self._normalize_spacing(result)
return result
def _analyze_sentiment(self, text):
"""Analyze sentiment of the text using the sentiment analyzer."""
if self.sentiment_analyzer:
try:
return self.sentiment_analyzer(text)[0]
except:
pass
return None
def _extract_topic(self, text):
"""Extract a potential topic from the text using spaCy."""
doc = self.nlp(text)
# Try to find entities
entities = list(doc.ents)
if entities:
return entities[0].text
# Try to find noun chunks
chunks = list(doc.noun_chunks)
if chunks:
return chunks[0].text
# Fallback to first sentence
sentences = sent_tokenize(text)
if sentences:
words = word_tokenize(sentences[0])
if words:
return words[0]
return "this"
def _apply_sentence_transformations(self, sentences, intensity, profile):
"""Apply various sentence-level transformations with personality profile influence."""
result = []
i = 0
while i < len(sentences):
# Randomly decide whether to apply a transformation
if random.random() < intensity * 0.7:
# Weight transformations based on personality
weights = [
1.0, # merge_sentences
0.8, # split_sentences
0.5 if profile.get('hedging_rate', 0.3) > 0.4 else 0.2, # passive_to_active
0.2 if profile.get('hedging_rate', 0.3) > 0.4 else 0.5, # active_to_passive
profile.get('hedging_rate', 0.3), # add_hedging
profile.get('intensifier_rate', 0.4), # add_intensifiers
0.3 if profile.get('informal_rate', 0.5) > 0.5 else 0.1, # add_rhetorical_question
0.4 if profile.get('informal_rate', 0.5) > 0.4 else 0.2, # add_aside_comment
]
# Normalize weights
weights = [w / sum(weights) for w in weights]
# Choose a transformation based on weights
transformation = random.choices(self.sentence_transformations, weights=weights)[0]
# For transformations requiring two sentences
if transformation in [self.merge_sentences] and i < len(sentences) - 1:
transformed = transformation(sentences[i], sentences[i+1])
result.append(transformed)
i += 2
# For transformations requiring one sentence
else:
transformed = transformation(sentences[i], "")
result.append(transformed)
i += 1
else:
result.append(sentences[i])
i += 1
return result
def _apply_word_transformations(self, text, intensity, profile):
"""Apply various word-level transformations with personality profile influence."""
# Parse the text with spaCy for better context
doc = self.nlp(text)
# Apply transformations separately to preserve sentence structure
sentences = [sent.text for sent in doc.sents]
transformed_sentences = []
for sentence in sentences:
# Tokenize sentence
sentence_doc = self.nlp(sentence)
# Build a new sentence from tokens
new_tokens = []
i = 0
while i < len(sentence_doc):
token = sentence_doc[i]
# Skip punctuation for most transformations
if token.is_punct:
new_tokens.append(token.text)
i += 1
continue
# Randomly decide whether to apply a transformation
if random.random() < intensity * 0.5:
# Weight transformations based on personality
weights = [
0.6, # contextual_synonym_replacement
profile.get('contraction_rate', 0.5), # contraction_expansion
profile.get('filler_rate', 0.3), # add_filler_words
profile.get('informal_rate', 0.5), # informal_substitution
0.3 if profile.get('intensifier_rate', 0.4) > 0.5 else 0.1, # add_emphatic_repetition
]
# Normalize weights
weights = [w / sum(weights) for w in weights]
# Choose a transformation based on weights
transformation = random.choices(self.word_transformations, weights=weights)[0]
# Apply transformation
if transformation == self.contextual_synonym_replacement:
transformed = transformation(token, sentence_doc)
elif transformation == self.contraction_expansion:
# Need to check if this is a multi-word expansion
if token.text.lower() in self.expansions:
# This is a potential expansion point
expansion = self.expansions[token.text.lower()]
if ' ' in expansion and i < len(sentence_doc) - 1:
# Check if the next tokens match the expansion
expansion_parts = expansion.split()
if expansion_parts[0].lower() == token.text.lower() and expansion_parts[1].lower() == sentence_doc[i+1].text.lower():
# Apply contraction
transformed = expansion
i += 1 # Skip the next token
else:
transformed = token.text
else:
transformed = token.text
else:
transformed = transformation(token)
elif transformation == self.add_filler_words:
# Add a filler word before the current word
if random.random() < 0.3: # Only occasionally add fillers
filler_category = random.choice(list(self.filler_words.keys()))
filler = random.choice(self.filler_words[filler_category])
new_tokens.append(filler)
transformed = token.text
elif transformation == self.informal_substitution:
transformed = transformation(token)
elif transformation == self.add_emphatic_repetition:
transformed = transformation(token)
else:
transformed = token.text
new_tokens.append(transformed)
else:
new_tokens.append(token.text)
i += 1
# Recreate the sentence from tokens
transformed_sentence = self._reconstruct_sentence(new_tokens)
transformed_sentences.append(transformed_sentence)
# Join the transformed sentences
result = " ".join(transformed_sentences)
return result
def _reconstruct_sentence(self, tokens):
"""Reconstruct a sentence from tokens, preserving proper spacing."""
result = ""
for i, token in enumerate(tokens):
# Handle special cases for punctuation
if token in ".,!?;:)]}" and result:
result = result.rstrip() + token + " "
# Don't add space after opening brackets
elif i > 0 and tokens[i-1] in "([{" and result:
result = result.rstrip() + token + " "
elif token in "([{" and result:
result = result.rstrip() + token
# Handle quotes
elif token in ['\'', '"'] and result and result[-1] != " ":
result += token + " "
else:
result += token + " "
return result.strip()
def _introduce_speech_errors(self, text, error_rate):
"""Introduce realistic speech/typing errors."""
words = text.split()
result = []
for i, word in enumerate(words):
if len(word) > 2 and random.random() < error_rate:
# Select a random error type
error_type = random.choice(list(self.speech_errors.keys()))
# Apply the error
error_func = self.speech_errors[error_type]
if error_type in ['restarts', 'repetitions'] and i > 0:
# These errors need previous context
modified = error_func(words[i-1], word)
result.pop() # Remove the previous word
result.append(modified)
else:
result.append(error_func(word))
else:
result.append(word)
return " ".join(result)
def _normalize_spacing(self, text):
"""Fix spacing around punctuation for a more natural look."""
# Fix spacing around punctuation
text = re.sub(r'\s+([.,;:!?)])', r'\1', text)
text = re.sub(r'([(])\s+', r'\1', text)
# Fix multiple spaces
text = re.sub(r'\s{2,}', ' ', text)
return text
# Speech error generators
def _generate_restart(self, prev_word, word):
"""Generate a restart error (e.g., "I was- I was saying")."""
return f"{prev_word}- {prev_word} {word}"
def _generate_repetition(self, prev_word, word):
"""Generate a word repetition (e.g., "the the cat")."""
return f"{prev_word} {prev_word} {word}"
def _generate_correction(self, word):
"""Generate a self-correction (e.g., "teh the")."""
if len(word) < 4:
return word
# Create a simple typo
i = random.randint(0, len(word) - 2)
typo = word[:i] + word[i+1] + word[i] + word[i+2:]
# Choose correction style
correction_style = random.choice(["asterisk", "dash", "explicit"])
if correction_style == "asterisk":
return f"{typo}*{word}*"
elif correction_style == "dash":
return f"{typo}-{word}"
else:
return f"{typo}, I mean {word}"
def _generate_filled_pause(self, word):
"""Generate a filled pause (e.g., "um, like")."""
filler = random.choice(self.filler_words['hesitation'])
return f"{filler}, {word}"
def _generate_agreement_error(self, word):
"""Generate a subject-verb agreement error (simplified)."""
if word.endswith('s') and len(word) > 3:
return word[:-1]
elif not word.endswith('s') and random.random() < 0.5:
return word + 's'
return word
# Enhanced sentence-level transformations
def merge_sentences(self, sent1, sent2):
"""Merge two sentences with a conjunction."""
# Remove the period from the first sentence
if sent1.endswith('.'):
sent1 = sent1[:-1]
# Choose a conjunction based on the content
doc1 = self.nlp(sent1)
doc2 = self.nlp(sent2)
# Check for content relationship
similarity = doc1.similarity(doc2)
if similarity > 0.7:
# Highly similar, use addition
conjunction = random.choice(["and", "also", "moreover", "furthermore"])
elif similarity < 0.3:
# Dissimilar, use contrast
conjunction = random.choice(["but", "however", "on the other hand", "yet"])
else:
# Moderate similarity, use general conjunction
conjunction = random.choice(["and", "while", "so", "because", "although"])
# Merge the sentences
return f"{sent1} {conjunction} {sent2.lower() if sent2 and sent2[0].isupper() else sent2}"
    def split_sentences(self, sent, _):
        """Split a longer sentence into two with improved linguistic awareness.

        Split points are chosen among conjunctions, prepositions, clause
        markers, punctuation, or new-clause subjects from the dependency
        parse, falling back to the midpoint.  The unused second parameter
        keeps the signature uniform with the other sentence transforms.
        """
        doc = self.nlp(sent)
        tokens = [token for token in doc]
        # Only split if sentence is long enough
        if len(tokens) < 8:
            return sent
        # Find a good split point based on dependency structure
        potential_splits = []
        for i, token in enumerate(tokens):
            # Good split points are often after conjunctions or punctuation;
            # the 3-token margins keep either half from being a tiny fragment.
            if (token.dep_ in ['cc', 'prep', 'mark'] or token.pos_ == 'PUNCT') and 3 < i < len(tokens) - 3:
                potential_splits.append((i, 1))  # Higher weight for these
            # Or before a new clause (a fresh subject)
            elif token.dep_ in ['nsubj', 'nsubjpass'] and i > 3:
                potential_splits.append((i, 0.8))
        if not potential_splits:
            # Fallback to middle
            split_point = len(tokens) // 2
        else:
            # Choose a split point with weighted random selection
            points, weights = zip(*potential_splits)
            split_point = random.choices(points, weights=weights)[0]
        # Create two new sentences; text_with_ws preserves original spacing
        sent1 = "".join([t.text_with_ws for t in tokens[:split_point]])
        sent2 = "".join([t.text_with_ws for t in tokens[split_point:]])
        # Ensure proper capitalization and punctuation
        sent1 = sent1.rstrip()
        if not sent1.endswith(('.', '!', '?')):
            sent1 += '.'
        sent2 = sent2.strip()
        if sent2 and sent2[0].islower():
            sent2 = sent2[0].upper() + sent2[1:]
        return f"{sent1} {sent2}"
    def passive_to_active(self, sent, _):
        """Convert passive voice to active voice using spaCy's dependency parsing.

        Only the first passive construction is handled, and only when an
        explicit "by ..." agent is attached to the verb; otherwise the
        sentence is returned unchanged.
        """
        doc = self.nlp(sent)
        # Look for passive constructions
        for token in doc:
            if token.dep_ == "nsubjpass":
                # Found passive voice
                subject = token
                agent = None
                verb = token.head
                # Find the agent (often introduced by "by")
                for child in doc:
                    if child.dep_ == "agent" and child.head == verb:
                        for grandchild in child.children:
                            if grandchild.dep_ in ["pobj", "nmod"]:
                                agent = grandchild
                                break
                if agent:
                    # Extract the core components (full subtrees via edge indices)
                    subj_span = doc[subject.left_edge.i:subject.right_edge.i+1].text
                    verb_span = doc[verb.i:verb.i+1].text
                    agent_span = doc[agent.left_edge.i:agent.right_edge.i+1].text
                    # Reconstruct in active voice.
                    # NOTE(review): verb_span is a single token, so these
                    # replacements can only fire if an auxiliary got merged
                    # into it — presumably dead code; verify with real parses.
                    active_verb = verb_span.replace("was ", "").replace("were ", "")
                    # Remove trailing period for reconstruction
                    if sent.endswith('.'):
                        new_sent = f"{agent_span} {active_verb} {subj_span}."
                    else:
                        new_sent = f"{agent_span} {active_verb} {subj_span}"
                    return new_sent
        # If no passive construction found or couldn't convert
        return sent
    def active_to_passive(self, sent, _):
        """Convert active voice to passive voice using spaCy's dependency parsing.

        Very simplified: always produces "was <verb>ed", so irregular verbs
        and plural subjects yield ungrammatical output — assumed acceptable
        for this humanizer's informal purposes.
        """
        doc = self.nlp(sent)
        # Look for active voice constructions
        for token in doc:
            if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
                # Found a subject and verb
                subject = token
                verb = token.head
                # Find the direct object
                obj = None
                for child in verb.children:
                    if child.dep_ in ["dobj", "obj"]:
                        obj = child
                        break
                if obj:
                    # Extract the core components (full subtrees via edge indices)
                    subj_span = doc[subject.left_edge.i:subject.right_edge.i+1].text
                    verb_span = doc[verb.i:verb.i+1].text
                    obj_span = doc[obj.left_edge.i:obj.right_edge.i+1].text
                    # Determine the passive verb form: strip third-person -s
                    # before appending -ed
                    passive_verb = verb_span
                    if verb_span.endswith("s"):
                        passive_verb = passive_verb[:-1]
                    # Reconstruct in passive voice
                    # Remove trailing period for reconstruction
                    if sent.endswith('.'):
                        new_sent = f"{obj_span} was {passive_verb}ed by {subj_span}."
                    else:
                        new_sent = f"{obj_span} was {passive_verb}ed by {subj_span}"
                    return new_sent
        # If no active construction found or couldn't convert
        return sent
def add_hedging(self, sent, _):
"""Add hedging language to a statement."""
# Add a hedging phrase at the beginning of the sentence
hedging = random.choice(self.hedging_phrases)
# For questions, add hedging at the end
if sent.endswith('?'):
return f"{sent[:-1]}, {hedging.lower()}?"
# For statements, add at the beginning
if sent[0].isupper():
return f"{hedging}, {sent[0].lower() + sent[1:]}"
return f"{hedging}, {sent}"
def add_intensifiers(self, sent, _):
"""Add intensifiers to adjectives and adverbs."""
doc = self.nlp(sent)
words = list(doc)
result = []
for i, token in enumerate(words):
# Add intensifier before adjectives and adverbs
if token.pos_ in ["ADJ", "ADV"] and random.random() < 0.6:
# Choose an appropriate intensifier
intensifier = random.choice(self.intensifiers)
# Add the intensifier before the adjective/adverb
result.append(intensifier)
# Add the current token
result.append(token.text)
return " ".join(result)
def add_rhetorical_question(self, sent, _):
"""Add a rhetorical question related to the statement."""
# Create a rhetorical question based on the content
doc = self.nlp(sent)
# Extract key information
subjects = [tok for tok in doc if tok.dep_ in ["nsubj", "nsubjpass"]]
if subjects and random.random() < 0.7:
subject = subjects[0].text
# Various question templates
templates = [
f"Isn't that interesting about {subject}?",
f"Don't you think so?",
f"Right?",
f"You know what I mean?",
f"Can you imagine?",
f"Who would have thought?",
f"Why is that so important?"
]
return f"{sent} {random.choice(templates)}"
return sent
def add_aside_comment(self, sent, _):
"""Add a parenthetical aside or comment."""
# Inject an aside comment in the middle or end of the sentence
doc = self.nlp(sent)
words = [token.text for token in doc]
# Choose position for the aside
if len(words) > 5:
position = random.randint(3, len(words) - 2) if len(words) > 5 else len(words)
else:
# If sentence is too short, add at the end
position = len(words)
# Create aside comments
asides = [
"by the way",
"if you ask me",
"I think",
"you know",
"to be honest",
"believe it or not",
"interestingly",
"surprisingly",
"and this is important"
]
aside = random.choice(asides)
# Insert the aside
if position < len(words):
# Insert in the middle, with commas
words.insert(position, f", {aside},")
else:
# Add at the end
if sent.endswith('.'):
words[-1] = words[-1][:-1] # Remove the period
words.append(f", {aside}.")
else:
words.append(f", {aside}")
return " ".join(words)
    # Word-level transformations
    def contextual_synonym_replacement(self, token, doc):
        """Replace a word with a contextually appropriate synonym.

        Candidates come from WordNet; each is vetted by re-parsing the text
        with the substitution and keeping it only when document similarity
        to the original stays above 0.8.

        NOTE(review): ``doc.text.replace(token.text, synonym)`` replaces
        every occurrence (including substrings) of the word, and each
        candidate costs a full spaCy parse — confirm this cost is acceptable
        on hot paths.
        """
        # Only replace content words
        if token.pos_ not in ["NOUN", "VERB", "ADJ", "ADV"] or token.is_stop:
            return token.text
        # Find synonyms using WordNet
        synonyms = []
        for syn in wordnet.synsets(token.text):
            for lemma in syn.lemmas():
                synonym = lemma.name().replace('_', ' ')
                if synonym != token.text and synonym not in synonyms:
                    synonyms.append(synonym)
        # If no synonyms found, return original
        if not synonyms:
            return token.text
        # Filter synonyms that fit the context
        filtered_synonyms = []
        for synonym in synonyms[:5]:  # Limit checking to 5 synonyms for efficiency
            # Create a new document with the synonym
            new_text = doc.text.replace(token.text, synonym)
            new_doc = self.nlp(new_text)
            # Calculate similarity between original and modified text
            similarity = doc.similarity(new_doc)
            if similarity > 0.8:  # High semantic similarity threshold
                filtered_synonyms.append((synonym, similarity))
        # If no good contextual synonyms, return original
        if not filtered_synonyms:
            return token.text
        # Sort by similarity and choose one of the top options
        filtered_synonyms.sort(key=lambda x: x[1], reverse=True)
        return random.choice(filtered_synonyms[:3])[0]
def contraction_expansion(self, token):
"""Toggle between contractions and their expansions."""
if token.text.lower() in self.contractions:
# Expand a contraction
return self.contractions[token.text.lower()]
elif token.text.lower() in self.expansions:
# Contract an expansion
return self.expansions[token.text.lower()]
return token.text
def add_filler_words(self, token):
"""Add filler words appropriate to the context."""
# Determine appropriate filler category based on token properties
filler_category = None
if token.pos_ == "ADJ":
filler_category = random.choice(["emphasis", "hedging"])
elif token.pos_ == "VERB":
filler_category = random.choice(["hesitation", "emphasis"])
elif token.pos_ == "NOUN":
filler_category = random.choice(["clarification", "informal"])
else:
filler_category = random.choice(list(self.filler_words.keys()))
filler = random.choice(self.filler_words[filler_category])
# Add the filler before the token
return f"{filler} {token.text}"
def informal_substitution(self, token):
"""Replace formal words with informal alternatives."""
if token.text.lower() in self.informal_words:
return random.choice(self.informal_words[token.text.lower()])
return token.text
def add_emphatic_repetition(self, token):
"""Add emphatic repetition for emphasis."""
# Only repeat certain word types
if token.pos_ in ["ADJ", "ADV"] and len(token.text) > 2:
# Choose repetition style
style = random.choice(["hyphen", "comma", "simple"])
if style == "hyphen":
return f"{token.text}-{token.text}"
elif style == "comma":
return f"{token.text}, {token.text}"
else:
return f"{token.text} {token.text}"
return token.text
# Paragraph-level transformations
def add_discourse_markers(self, paragraph, profile):
"""Add discourse markers to enhance cohesion."""
sentences = sent_tokenize(paragraph)
if len(sentences) <= 1:
return paragraph
# Determine appropriate markers based on content
marker_types = list(self.discourse_markers.keys())
weighted_types = random.choices(
marker_types,
weights=[0.2, 0.2, 0.2, 0.15, 0.1, 0.1, 0.05],
k=min(len(sentences)-1, 3) # Don't add too many markers
)
# Add markers to random sentences
num_markers = min(len(sentences) - 1, max(1, int(len(sentences) * 0.5)))
positions = sorted(random.sample(range(1, len(sentences)), num_markers))
for i, pos in enumerate(positions):
marker_type = weighted_types[i % len(weighted_types)]
marker = random.choice(self.discourse_markers[marker_type])
# Add the marker at the beginning of the sentence
sentences[pos] = f"{marker}, {sentences[pos][0].lower() + sentences[pos][1:]}"
return " ".join(sentences)
def adjust_formality(self, paragraph, profile):
"""Adjust the overall formality of the paragraph."""
formality_level = profile.get('informal_rate', 0.5)
# For formal text (low informality)
if formality_level < 0.3:
# Replace contractions with expansions
for contraction, expansion in self.contractions.items():
pattern = r'\b' + re.escape(contraction) + r'\b'
paragraph = re.sub(pattern, expansion, paragraph, flags=re.IGNORECASE)
# Remove certain informal phrases
informal_phrases = ["you know", "like", "kinda", "sort of", "pretty much"]
for phrase in informal_phrases:
paragraph = re.sub(r'\b' + re.escape(phrase) + r'\b', '', paragraph, flags=re.IGNORECASE)
# For informal text (high informality)
elif formality_level > 0.7:
# Replace formal words with informal alternatives
for formal, informals in self.informal_words.items():
pattern = r'\b' + re.escape(formal) + r'\b'
if random.random() < 0.7:
replacement = random.choice(informals)
paragraph = re.sub(pattern, replacement, paragraph, flags=re.IGNORECASE)
# Add contractions
for expansion, contraction in self.expansions.items():
if ' ' in expansion: # Only multi-word expansions
pattern = r'\b' + re.escape(expansion) + r'\b'
paragraph = re.sub(pattern, contraction, paragraph, flags=re.IGNORECASE)
return paragraph
    def add_cohesion_devices(self, paragraph, profile):
        """Add cohesion devices like pronouns and references.

        Picks a salient entity (or long noun) and, in the sentence following
        its first mention, replaces a repeat mention with a pronoun or
        prefixes it with a determiner.  ``profile`` is accepted for
        signature uniformity but unused here.

        NOTE(review): the "repetition" reference type can be chosen but has
        no handling branch, so it leaves the paragraph unchanged — confirm
        whether that is intended.
        """
        sentences = sent_tokenize(paragraph)
        if len(sentences) <= 1:
            return paragraph
        # Parse the paragraph
        doc = self.nlp(paragraph)
        # Extract key entities
        entities = {}
        for ent in doc.ents:
            if ent.label_ in ["PERSON", "ORG", "GPE", "LOC", "PRODUCT"]:
                if ent.text not in entities:
                    entities[ent.text] = []
                entities[ent.text].append(ent.label_)
        # Extract key nouns
        nouns = [token.text for token in doc if token.pos_ == "NOUN" and len(token.text) > 3]
        # If no entities or nouns found, return original
        if not entities and not nouns:
            return paragraph
        # Choose an entity or noun to reference (entities take priority)
        reference_subject = None
        if entities:
            reference_subject = random.choice(list(entities.keys()))
        elif nouns:
            reference_subject = random.choice(nouns)
        if not reference_subject:
            return paragraph
        # Choose a reference type
        ref_type = random.choice(["pronoun", "determiner", "repetition"])
        # Apply the reference in a later sentence
        if ref_type == "pronoun":
            # Simple pronoun substitution (could be improved with gender recognition)
            pronouns = ["it", "they", "this", "these", "that", "those"]
            pronoun = random.choice(pronouns)
            # Find a sentence with the reference subject
            for i, sent in enumerate(sentences):
                if reference_subject in sent and i < len(sentences) - 1:
                    # Replace in the next sentence if possible
                    next_sent = sentences[i+1]
                    if reference_subject in next_sent:
                        sentences[i+1] = next_sent.replace(reference_subject, pronoun, 1)
                    break
        elif ref_type == "determiner":
            # Add a determiner phrase
            determiners = ["this", "that", "these", "those", "the", "such a"]
            determiner = random.choice(determiners)
            # Find a sentence with the reference subject
            for i, sent in enumerate(sentences):
                if reference_subject in sent and i < len(sentences) - 1:
                    # Add in the next sentence if possible
                    sentences[i+1] = sentences[i+1].replace(
                        reference_subject,
                        f"{determiner} {reference_subject}",
                        1
                    )
                    break
        return " ".join(sentences)
# Example usage: exercise each personality, a regional dialect, and an
# emotional tone against the same sample paragraph.
if __name__ == "__main__":
    humanizer = EnhancedTextHumanizer()
    sample_text = """
Artificial intelligence has significantly impacted numerous industries.
It has improved efficiency in manufacturing through automation.
The healthcare sector has benefited from better diagnostic tools.
Machine learning algorithms continue to advance and provide new solutions.
Companies invest heavily in AI research and development.
"""
    # Every built-in personality profile.
    for personality in ['casual', 'formal', 'academic', 'enthusiastic']:
        print(f"\n--- {personality.upper()} PERSONALITY ---")
        print(humanizer.humanize_text(sample_text, intensity=0.7, personality=personality))
    # Regional dialect variation.
    print("\n--- REGIONAL DIALECT (US SOUTH) ---")
    print(humanizer.humanize_text(sample_text, intensity=0.7, personality='casual', regional_dialect='us_south'))
    # Forced emotional tone.
    print("\n--- EMOTIONAL TONE (POSITIVE) ---")
    print(humanizer.humanize_text(sample_text, intensity=0.7, personality='enthusiastic', emotional_tone='positive'))