import random
import re
from collections import defaultdict

import nltk
import numpy as np
import spacy
from nltk.corpus import wordnet
from nltk.tag import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import pipeline

# Make sure the NLTK resources used below are available; download any that
# are missing.  (punkt_tab is needed by newer NLTK releases alongside punkt.)
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('taggers/averaged_perceptron_tagger')
    nltk.data.find('corpora/wordnet')
except LookupError as e:
    print(f"NLTK resource error: {e}")
    print("Attempting to download missing resources...")
    nltk.download('punkt')
    nltk.download('punkt_tab')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('wordnet')


class EnhancedTextHumanizer:
    """Rewrite text so it reads more like spontaneous human writing.

    Applies randomized word-, sentence-, and paragraph-level transformations
    (synonyms, contractions, fillers, hedging, discourse markers, optional
    speech errors and regional dialect) modulated by an ``intensity`` knob
    and a personality profile.
    """

    def __init__(self):
        # Load the spaCy model once; every transformation reuses self.nlp.
        self.nlp = spacy.load("en_core_web_sm")

        # Sentiment analysis is optional: without it, emotional variations
        # fall back to the 'neutral' tone.
        try:
            self.sentiment_analyzer = pipeline("sentiment-analysis")
        except Exception:
            print("Warning: Transformers sentiment analysis not available. Emotional variations will be limited.")
            self.sentiment_analyzer = None

        # Sentence-level transformations
        self.sentence_transformations = [
            self.merge_sentences,
            self.split_sentences,
            self.passive_to_active,
            self.active_to_passive,
            self.add_hedging,
            self.add_intensifiers,
            self.add_rhetorical_question,
            self.add_aside_comment,
        ]

        # Word-level transformations
        self.word_transformations = [
            self.contextual_synonym_replacement,
            self.contraction_expansion,
            self.add_filler_words,
            self.informal_substitution,
            self.add_emphatic_repetition,
        ]

        # Paragraph-level transformations
        self.paragraph_transformations = [
            self.add_discourse_markers,
            self.adjust_formality,
            self.add_cohesion_devices,
        ]

        # Filler words grouped by pragmatic function.
        self.filler_words = {
            'hesitation': ["um", "uh", "er", "hmm", "like"],
            'emphasis': ["actually", "literally", "seriously", "honestly", "truly", "really"],
            'hedging': ["maybe", "perhaps", "probably", "possibly", "somewhat", "kinda", "sort of"],
            'clarification': ["I mean", "you know", "what I'm saying", "in other words"],
            'informal': ["basically", "totally", "absolutely", "pretty much", "y'know"],
        }

        # Contractions and their expansions.
        self.contractions = {
            "can't": "cannot", "won't": "will not", "don't": "do not",
            "doesn't": "does not", "I'm": "I am", "you're": "you are",
            "they're": "they are", "we're": "we are", "it's": "it is",
            "that's": "that is", "who's": "who is", "what's": "what is",
            "there's": "there is", "here's": "here is", "he's": "he is",
            "she's": "she is", "I've": "I have", "you've": "you have",
            "we've": "we have", "they've": "they have", "I'll": "I will",
            "you'll": "you will", "he'll": "he will", "she'll": "she will",
            "we'll": "we will", "they'll": "they will", "I'd": "I would",
            "you'd": "you would", "he'd": "he would", "she'd": "she would",
            "we'd": "we would", "they'd": "they would", "let's": "let us",
            "ain't": "am not", "wasn't": "was not", "weren't": "were not",
            "hasn't": "has not", "haven't": "have not",
            "couldn't": "could not", "shouldn't": "should not",
            "wouldn't": "would not", "didn't": "did not", "isn't": "is not",
            "aren't": "are not", "mightn't": "might not",
            "mustn't": "must not", "shan't": "shall not",
            "needn't": "need not",
        }

        # Reverse lookup: expansion -> contraction.
        self.expansions = {v: k for k, v in self.contractions.items()}

        # Common hedging phrases.
        self.hedging_phrases = [
            "I think", "It seems", "It appears", "From what I understand",
            "As far as I know", "In my opinion", "Arguably", "Presumably",
            "It could be that", "It's possible that", "One might say",
            "To some extent", "More or less", "Kind of", "Sort of",
        ]

        # Intensifiers for emotional emphasis.
        self.intensifiers = [
            "very", "extremely", "incredibly", "remarkably", "absolutely",
            "totally", "completely", "utterly", "entirely", "thoroughly",
            "ridiculously", "insanely", "super", "really", "quite",
            "unbelievably", "amazingly", "surprisingly", "exceptionally",
        ]

        # Discourse markers keyed by rhetorical relation.
        self.discourse_markers = {
            'contrast': ["however", "but", "nevertheless", "on the other hand", "conversely", "in contrast", "yet"],
            'addition': ["moreover", "furthermore", "additionally", "also", "besides", "in addition", "plus"],
            'cause_effect': ["therefore", "consequently", "thus", "hence", "as a result", "so", "because of this"],
            'sequence': ["first", "second", "next", "then", "finally", "subsequently", "later"],
            'example': ["for example", "for instance", "specifically", "to illustrate", "such as", "namely"],
            'conclusion': ["in conclusion", "to sum up", "in summary", "overall", "ultimately", "in the end"],
            'emphasis': ["indeed", "certainly", "in fact", "obviously", "clearly", "notably", "significantly"],
        }

        # Formal word -> informal alternatives.
        self.informal_words = {
            "approximately": ["about", "around"],
            "assistance": ["help", "a hand"],
            "attempt": ["try", "shot", "stab"],
            "communicate": ["talk", "chat", "get in touch"],
            "comprehend": ["get", "understand"],
            "concerning": ["about", "on"],
            "consume": ["eat", "drink", "use up"],
            "currently": ["now", "right now"],
            "decrease": ["drop", "cut", "fall"],
            "difficult": ["hard", "tough"],
            "encounter": ["meet", "run into", "bump into"],
            "endeavor": ["try", "take a shot"],
            "excessive": ["too much", "over the top"],
            "expedite": ["speed up", "hurry"],
            "facilitate": ["help", "make easier"],
            "frequently": ["often", "a lot"],
            "fundamental": ["basic", "key"],
            "utilize": ["use"],
            "purchase": ["buy", "get"],
            "sufficient": ["enough"],
            "inquire": ["ask"],
            "obtain": ["get"],
            "require": ["need"],
            "additional": ["more", "extra"],
            "residence": ["home", "place"],
        }

        # Speech-error generators keyed by error type; restarts/repetitions
        # take (prev_word, word), the rest take a single word.
        self.speech_errors = {
            'restarts': self._generate_restart,
            'repetitions': self._generate_repetition,
            'corrections': self._generate_correction,
            'filled_pauses': self._generate_filled_pause,
            'agreement_errors': self._generate_agreement_error,
        }

        # Regional dialect variations (simplified).
        self.regional_variations = {
            'us_south': {
                'you all': "y'all",
                'going to': "gonna",
                'want to': "wanna",
                'did not': "didn't",
                'yes': "yep",
                'no': "nope",
            },
            'british': {
                'apartment': "flat",
                'elevator': "lift",
                'trash': "rubbish",
                'sidewalk': "pavement",
                'vacation': "holiday",
                'soccer': "football",
            },
        }

        # Personality profiles: per-transformation application rates.
        self.personality_profiles = {
            'casual': {
                'contraction_rate': 0.8,
                'informal_rate': 0.7,
                'hedging_rate': 0.3,
                'filler_rate': 0.4,
                'error_rate': 0.1,
            },
            'formal': {
                'contraction_rate': 0.2,
                'informal_rate': 0.1,
                'hedging_rate': 0.5,
                'filler_rate': 0.1,
                'error_rate': 0.05,
            },
            'academic': {
                'contraction_rate': 0.1,
                'informal_rate': 0.05,
                'hedging_rate': 0.6,
                'filler_rate': 0.1,
                'error_rate': 0.02,
            },
            'enthusiastic': {
                'contraction_rate': 0.6,
                'informal_rate': 0.5,
                'hedging_rate': 0.2,
                'filler_rate': 0.3,
                'error_rate': 0.1,
                'intensifier_rate': 0.7,
            },
        }

        # Emotional expression templates; str.format ignores unused keys, so
        # each template may use {topic}, {sentence}, or both.
        self.emotional_expressions = {
            'positive': [
                "I'm so happy about {topic}!",
                "This is amazing: {sentence}",
                "I love how {sentence}",
                "Wow, {sentence} That's incredible!",
                "I'm really excited about {topic}.",
            ],
            'negative': [
                "I'm not too thrilled about {topic}.",
                "Unfortunately, {sentence}",
                "I'm concerned that {sentence}",
                "This is disappointing: {sentence}",
                "I'm a bit worried about {topic}.",
            ],
            'neutral': [
                "In my view, {sentence}",
                "I think {sentence}",
                "From what I understand, {sentence}",
                "My take on {topic} is that {sentence}",
                "When it comes to {topic}, {sentence}",
            ],
        }

    def humanize_text(self, text, intensity=0.5, personality='casual',
                      add_errors=True, regional_dialect=None,
                      emotional_tone=None):
        """
        Enhanced main function to humanize text with multiple parameters for
        customization.

        Args:
            text (str): The input text to humanize
            intensity (float): Controls how much the text is transformed (0.0 to 1.0)
            personality (str): Personality profile to use ('casual', 'formal', 'academic', 'enthusiastic')
            add_errors (bool): Whether to add realistic speech/typing errors
            regional_dialect (str): Regional dialect to incorporate (None, 'us_south', 'british')
            emotional_tone (str): Overall emotional tone (None, 'positive', 'negative', 'neutral')

        Returns:
            str: Humanized text

        Raises:
            ValueError: If ``intensity`` is outside [0.0, 1.0].
        """
        if intensity < 0 or intensity > 1:
            raise ValueError("Intensity must be between 0.0 and 1.0")

        # Unknown personality names silently fall back to 'casual'.
        profile = self.personality_profiles.get(personality, self.personality_profiles['casual'])

        # Split text into paragraphs (blank lines separate paragraphs).
        paragraphs = [p.strip() for p in text.split('\n') if p.strip()]

        transformed_paragraphs = []
        for para in paragraphs:
            # Only analyze sentiment when no explicit tone was requested.
            sentiment = self._analyze_sentiment(para) if emotional_tone is None and self.sentiment_analyzer else None
            current_tone = emotional_tone or (sentiment['label'].lower() if sentiment else 'neutral')

            # Occasionally wrap the paragraph in an emotional expression.
            if random.random() < intensity * 0.3 and current_tone in self.emotional_expressions:
                topic = self._extract_topic(para)
                expression = random.choice(self.emotional_expressions[current_tone])
                para = expression.format(topic=topic, sentence=para.lower() if para[0].isupper() else para)

            # Apply paragraph-level transformations.
            for transform in self.paragraph_transformations:
                if random.random() < intensity * 0.4:
                    para = transform(para, profile)

            # Sentence-level transformations.
            sentences = sent_tokenize(para)
            transformed_sentences = self._apply_sentence_transformations(sentences, intensity, profile)

            # Apply regional dialect substitutions if requested.
            if regional_dialect and regional_dialect in self.regional_variations:
                transformed_para = " ".join(transformed_sentences)
                for original, variant in self.regional_variations[regional_dialect].items():
                    # Word boundaries avoid partial-word replacements.
                    pattern = r'\b' + re.escape(original) + r'\b'
                    if random.random() < intensity * 0.7:
                        transformed_para = re.sub(pattern, variant, transformed_para, flags=re.IGNORECASE)
                transformed_paragraphs.append(transformed_para)
            else:
                transformed_paragraphs.append(" ".join(transformed_sentences))

        result = "\n\n".join(transformed_paragraphs)

        # Word-level transformations run over the whole joined text.
        result = self._apply_word_transformations(result, intensity, profile)

        # Speech errors only kick in above a minimum intensity.
        if add_errors and intensity > 0.2:
            result = self._introduce_speech_errors(result, intensity * profile.get('error_rate', 0.1))

        # Final cleanup of spacing around punctuation.
        result = self._normalize_spacing(result)
        return result

    def _analyze_sentiment(self, text):
        """Analyze sentiment of the text using the sentiment analyzer.

        Returns the first pipeline result dict, or None when the analyzer is
        unavailable or fails.
        """
        if self.sentiment_analyzer:
            try:
                return self.sentiment_analyzer(text)[0]
            except Exception:
                pass
        return None

    def _extract_topic(self, text):
        """Extract a potential topic from the text using spaCy.

        Preference order: first named entity, first noun chunk, first word of
        the first sentence, then the literal fallback "this".
        """
        doc = self.nlp(text)
        entities = list(doc.ents)
        if entities:
            return entities[0].text
        chunks = list(doc.noun_chunks)
        if chunks:
            return chunks[0].text
        sentences = sent_tokenize(text)
        if sentences:
            words = word_tokenize(sentences[0])
            if words:
                return words[0]
        return "this"

    def _apply_sentence_transformations(self, sentences, intensity, profile):
        """Apply sentence-level transformations weighted by the personality profile."""
        result = []
        i = 0
        while i < len(sentences):
            if random.random() < intensity * 0.7:
                # Weights are ordered to match self.sentence_transformations.
                weights = [
                    1.0,  # merge_sentences
                    0.8,  # split_sentences
                    0.5 if profile.get('hedging_rate', 0.3) > 0.4 else 0.2,  # passive_to_active
                    0.2 if profile.get('hedging_rate', 0.3) > 0.4 else 0.5,  # active_to_passive
                    profile.get('hedging_rate', 0.3),  # add_hedging
                    profile.get('intensifier_rate', 0.4),  # add_intensifiers
                    0.3 if profile.get('informal_rate', 0.5) > 0.5 else 0.1,  # add_rhetorical_question
                    0.4 if profile.get('informal_rate', 0.5) > 0.4 else 0.2,  # add_aside_comment
                ]
                # Normalize weights (sum hoisted out of the comprehension).
                total = sum(weights)
                weights = [w / total for w in weights]

                transformation = random.choices(self.sentence_transformations, weights=weights)[0]

                if transformation in [self.merge_sentences] and i < len(sentences) - 1:
                    # merge_sentences consumes two consecutive sentences.
                    result.append(transformation(sentences[i], sentences[i + 1]))
                    i += 2
                else:
                    # All other transformations take one sentence (second
                    # argument is an ignored placeholder).
                    result.append(transformation(sentences[i], ""))
                    i += 1
            else:
                result.append(sentences[i])
                i += 1
        return result

    def _apply_word_transformations(self, text, intensity, profile):
        """Apply word-level transformations weighted by the personality profile."""
        doc = self.nlp(text)
        # Work sentence by sentence to preserve sentence structure.
        sentences = [sent.text for sent in doc.sents]
        transformed_sentences = []

        for sentence in sentences:
            sentence_doc = self.nlp(sentence)
            new_tokens = []
            i = 0
            while i < len(sentence_doc):
                token = sentence_doc[i]
                # Punctuation passes through untouched.
                if token.is_punct:
                    new_tokens.append(token.text)
                    i += 1
                    continue

                if random.random() < intensity * 0.5:
                    # Weights are ordered to match self.word_transformations.
                    weights = [
                        0.6,  # contextual_synonym_replacement
                        profile.get('contraction_rate', 0.5),  # contraction_expansion
                        profile.get('filler_rate', 0.3),  # add_filler_words
                        profile.get('informal_rate', 0.5),  # informal_substitution
                        0.3 if profile.get('intensifier_rate', 0.4) > 0.5 else 0.1,  # add_emphatic_repetition
                    ]
                    total = sum(weights)
                    weights = [w / total for w in weights]

                    transformation = random.choices(self.word_transformations, weights=weights)[0]

                    if transformation == self.contextual_synonym_replacement:
                        transformed = transformation(token, sentence_doc)
                    elif transformation == self.contraction_expansion:
                        # Multi-word expansions ("will not" -> "won't") span
                        # two tokens, so test the two-token window against the
                        # expansion table before falling back to the
                        # single-token toggle.
                        two_word = None
                        if i < len(sentence_doc) - 1:
                            two_word = f"{token.text.lower()} {sentence_doc[i + 1].text.lower()}"
                        if two_word and two_word in self.expansions:
                            transformed = self.expansions[two_word]
                            i += 1  # Skip the second word of the expansion.
                        else:
                            transformed = transformation(token)
                    elif transformation == self.add_filler_words:
                        # Only occasionally prepend a filler word.
                        if random.random() < 0.3:
                            filler_category = random.choice(list(self.filler_words.keys()))
                            filler = random.choice(self.filler_words[filler_category])
                            new_tokens.append(filler)
                        transformed = token.text
                    elif transformation == self.informal_substitution:
                        transformed = transformation(token)
                    elif transformation == self.add_emphatic_repetition:
                        transformed = transformation(token)
                    else:
                        transformed = token.text

                    new_tokens.append(transformed)
                else:
                    new_tokens.append(token.text)
                i += 1

            transformed_sentences.append(self._reconstruct_sentence(new_tokens))

        return " ".join(transformed_sentences)

    def _reconstruct_sentence(self, tokens):
        """Reconstruct a sentence from tokens, preserving proper spacing."""
        result = ""
        for i, token in enumerate(tokens):
            # No space before closing punctuation.
            if token in ".,!?;:)]}" and result:
                result = result.rstrip() + token + " "
            # No space after opening brackets.
            elif i > 0 and tokens[i - 1] in "([{" and result:
                result = result.rstrip() + token + " "
            elif token in "([{" and result:
                result = result.rstrip() + token
            # Quotes attach to the preceding word.
            elif token in ['\'', '"'] and result and result[-1] != " ":
                result += token + " "
            else:
                result += token + " "
        return result.strip()

    def _introduce_speech_errors(self, text, error_rate):
        """Introduce realistic speech/typing errors at the given rate."""
        words = text.split()
        result = []
        for i, word in enumerate(words):
            if len(word) > 2 and random.random() < error_rate:
                error_type = random.choice(list(self.speech_errors.keys()))
                error_func = self.speech_errors[error_type]
                if error_type in ['restarts', 'repetitions'] and i > 0:
                    # These errors rewrite the previous word as well.
                    modified = error_func(words[i - 1], word)
                    result.pop()  # Remove the previous word; it is re-emitted.
                    result.append(modified)
                else:
                    result.append(error_func(word))
            else:
                result.append(word)
        return " ".join(result)

    def _normalize_spacing(self, text):
        """Fix spacing around punctuation for a more natural look."""
        text = re.sub(r'\s+([.,;:!?)])', r'\1', text)
        text = re.sub(r'([(])\s+', r'\1', text)
        # Collapse runs of whitespace.
        text = re.sub(r'\s{2,}', ' ', text)
        return text

    # Speech error generators

    def _generate_restart(self, prev_word, word):
        """Generate a restart error (e.g., "I was- I was saying")."""
        return f"{prev_word}- {prev_word} {word}"

    def _generate_repetition(self, prev_word, word):
        """Generate a word repetition (e.g., "the the cat")."""
        return f"{prev_word} {prev_word} {word}"

    def _generate_correction(self, word):
        """Generate a self-correction (e.g., "teh the")."""
        if len(word) < 4:
            return word
        # Create a simple typo by swapping two adjacent characters.
        i = random.randint(0, len(word) - 2)
        typo = word[:i] + word[i + 1] + word[i] + word[i + 2:]
        correction_style = random.choice(["asterisk", "dash", "explicit"])
        if correction_style == "asterisk":
            return f"{typo}*{word}*"
        elif correction_style == "dash":
            return f"{typo}-{word}"
        else:
            return f"{typo}, I mean {word}"

    def _generate_filled_pause(self, word):
        """Generate a filled pause (e.g., "um, like")."""
        filler = random.choice(self.filler_words['hesitation'])
        return f"{filler}, {word}"

    def _generate_agreement_error(self, word):
        """Generate a subject-verb agreement error (simplified)."""
        if word.endswith('s') and len(word) > 3:
            return word[:-1]
        elif not word.endswith('s') and random.random() < 0.5:
            return word + 's'
        return word

    # Enhanced sentence-level transformations

    def merge_sentences(self, sent1, sent2):
        """Merge two sentences with a conjunction chosen by semantic similarity."""
        if sent1.endswith('.'):
            sent1 = sent1[:-1]

        doc1 = self.nlp(sent1)
        doc2 = self.nlp(sent2)

        # Similarity drives the conjunction choice: similar content uses
        # additive conjunctions, dissimilar content uses contrastive ones.
        similarity = doc1.similarity(doc2)
        if similarity > 0.7:
            conjunction = random.choice(["and", "also", "moreover", "furthermore"])
        elif similarity < 0.3:
            conjunction = random.choice(["but", "however", "on the other hand", "yet"])
        else:
            conjunction = random.choice(["and", "while", "so", "because", "although"])

        return f"{sent1} {conjunction} {sent2.lower() if sent2 and sent2[0].isupper() else sent2}"

    def split_sentences(self, sent, _):
        """Split a longer sentence into two with improved linguistic awareness."""
        doc = self.nlp(sent)
        tokens = [token for token in doc]

        # Only split if the sentence is long enough.
        if len(tokens) < 8:
            return sent

        # Score candidate split points from the dependency structure.
        potential_splits = []
        for i, token in enumerate(tokens):
            # Good split points are often after conjunctions or punctuation.
            if (token.dep_ in ['cc', 'prep', 'mark'] or token.pos_ == 'PUNCT') and 3 < i < len(tokens) - 3:
                potential_splits.append((i, 1))  # Higher weight for these.
            # Or before a new clause.
            elif token.dep_ in ['nsubj', 'nsubjpass'] and i > 3:
                potential_splits.append((i, 0.8))

        if not potential_splits:
            split_point = len(tokens) // 2  # Fallback to the middle.
        else:
            points, weights = zip(*potential_splits)
            split_point = random.choices(points, weights=weights)[0]

        # text_with_ws preserves the original intra-sentence spacing.
        sent1 = "".join([t.text_with_ws for t in tokens[:split_point]])
        sent2 = "".join([t.text_with_ws for t in tokens[split_point:]])

        # Ensure proper capitalization and punctuation.
        sent1 = sent1.rstrip()
        if not sent1.endswith(('.', '!', '?')):
            sent1 += '.'
        sent2 = sent2.strip()
        if sent2 and sent2[0].islower():
            sent2 = sent2[0].upper() + sent2[1:]

        return f"{sent1} {sent2}"

    def passive_to_active(self, sent, _):
        """Convert passive voice to active voice using spaCy's dependency parsing.

        Returns the sentence unchanged when no convertible passive
        construction (with an explicit "by" agent) is found.
        """
        doc = self.nlp(sent)
        for token in doc:
            if token.dep_ == "nsubjpass":
                subject = token
                agent = None
                verb = token.head
                # Find the agent (often introduced by "by").
                for child in doc:
                    if child.dep_ == "agent" and child.head == verb:
                        for grandchild in child.children:
                            if grandchild.dep_ in ["pobj", "nmod"]:
                                agent = grandchild
                                break
                if agent:
                    subj_span = doc[subject.left_edge.i:subject.right_edge.i + 1].text
                    verb_span = doc[verb.i:verb.i + 1].text
                    agent_span = doc[agent.left_edge.i:agent.right_edge.i + 1].text
                    # Naive de-passivization: drop the auxiliary.
                    active_verb = verb_span.replace("was ", "").replace("were ", "")
                    if sent.endswith('.'):
                        new_sent = f"{agent_span} {active_verb} {subj_span}."
                    else:
                        new_sent = f"{agent_span} {active_verb} {subj_span}"
                    return new_sent
        return sent

    def active_to_passive(self, sent, _):
        """Convert active voice to passive voice using spaCy's dependency parsing.

        Returns the sentence unchanged when no subject-verb-object
        construction is found.
        """
        doc = self.nlp(sent)
        for token in doc:
            if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
                subject = token
                verb = token.head
                obj = None
                for child in verb.children:
                    if child.dep_ in ["dobj", "obj"]:
                        obj = child
                        break
                if obj:
                    subj_span = doc[subject.left_edge.i:subject.right_edge.i + 1].text
                    verb_span = doc[verb.i:verb.i + 1].text
                    obj_span = doc[obj.left_edge.i:obj.right_edge.i + 1].text
                    # Naive passive verb form: strip 3rd-person -s, add -ed.
                    passive_verb = verb_span
                    if verb_span.endswith("s"):
                        passive_verb = passive_verb[:-1]
                    if sent.endswith('.'):
                        new_sent = f"{obj_span} was {passive_verb}ed by {subj_span}."
                    else:
                        new_sent = f"{obj_span} was {passive_verb}ed by {subj_span}"
                    return new_sent
        return sent

    def add_hedging(self, sent, _):
        """Add hedging language to a statement."""
        hedging = random.choice(self.hedging_phrases)
        # Questions get the hedge appended before the question mark.
        if sent.endswith('?'):
            return f"{sent[:-1]}, {hedging.lower()}?"
        # Statements get the hedge prepended, lowering the original opener.
        if sent[0].isupper():
            return f"{hedging}, {sent[0].lower() + sent[1:]}"
        return f"{hedging}, {sent}"

    def add_intensifiers(self, sent, _):
        """Add intensifiers to adjectives and adverbs.

        NOTE(review): rejoining with single spaces detaches punctuation;
        _normalize_spacing later re-attaches most of it.
        """
        doc = self.nlp(sent)
        words = list(doc)
        result = []
        for i, token in enumerate(words):
            if token.pos_ in ["ADJ", "ADV"] and random.random() < 0.6:
                result.append(random.choice(self.intensifiers))
            result.append(token.text)
        return " ".join(result)

    def add_rhetorical_question(self, sent, _):
        """Add a rhetorical question related to the statement."""
        doc = self.nlp(sent)
        subjects = [tok for tok in doc if tok.dep_ in ["nsubj", "nsubjpass"]]
        if subjects and random.random() < 0.7:
            subject = subjects[0].text
            templates = [
                f"Isn't that interesting about {subject}?",
                f"Don't you think so?",
                f"Right?",
                f"You know what I mean?",
                f"Can you imagine?",
                f"Who would have thought?",
                f"Why is that so important?",
            ]
            return f"{sent} {random.choice(templates)}"
        return sent

    def add_aside_comment(self, sent, _):
        """Add a parenthetical aside or comment."""
        doc = self.nlp(sent)
        words = [token.text for token in doc]

        # Insert mid-sentence when long enough, otherwise append at the end.
        if len(words) > 5:
            position = random.randint(3, len(words) - 2)
        else:
            position = len(words)

        asides = [
            "by the way", "if you ask me", "I think", "you know",
            "to be honest", "believe it or not", "interestingly",
            "surprisingly", "and this is important",
        ]
        aside = random.choice(asides)

        if position < len(words):
            # Insert in the middle, set off with commas.
            words.insert(position, f", {aside},")
        else:
            if sent.endswith('.'):
                words[-1] = words[-1][:-1]  # Remove the period.
                words.append(f", {aside}.")
            else:
                words.append(f", {aside}")

        return " ".join(words)

    # Word-level transformations

    def contextual_synonym_replacement(self, token, doc):
        """Replace a word with a contextually appropriate WordNet synonym.

        Candidate synonyms are re-scored by document similarity after
        substitution; only near-equivalent rewrites (similarity > 0.8) are
        kept.
        """
        # Only replace content words.
        if token.pos_ not in ["NOUN", "VERB", "ADJ", "ADV"] or token.is_stop:
            return token.text

        synonyms = []
        for syn in wordnet.synsets(token.text):
            for lemma in syn.lemmas():
                synonym = lemma.name().replace('_', ' ')
                if synonym != token.text and synonym not in synonyms:
                    synonyms.append(synonym)

        if not synonyms:
            return token.text

        # Check up to 5 candidates for efficiency.  NOTE(review): str.replace
        # substitutes every occurrence of the token text, not just this token.
        filtered_synonyms = []
        for synonym in synonyms[:5]:
            new_text = doc.text.replace(token.text, synonym)
            new_doc = self.nlp(new_text)
            similarity = doc.similarity(new_doc)
            if similarity > 0.8:
                filtered_synonyms.append((synonym, similarity))

        if not filtered_synonyms:
            return token.text

        # Pick randomly among the top-3 most similar candidates.
        filtered_synonyms.sort(key=lambda x: x[1], reverse=True)
        return random.choice(filtered_synonyms[:3])[0]

    def contraction_expansion(self, token):
        """Toggle between contractions and their expansions."""
        if token.text.lower() in self.contractions:
            return self.contractions[token.text.lower()]
        elif token.text.lower() in self.expansions:
            return self.expansions[token.text.lower()]
        return token.text

    def add_filler_words(self, token):
        """Prepend a filler word appropriate to the token's part of speech."""
        if token.pos_ == "ADJ":
            filler_category = random.choice(["emphasis", "hedging"])
        elif token.pos_ == "VERB":
            filler_category = random.choice(["hesitation", "emphasis"])
        elif token.pos_ == "NOUN":
            filler_category = random.choice(["clarification", "informal"])
        else:
            filler_category = random.choice(list(self.filler_words.keys()))
        filler = random.choice(self.filler_words[filler_category])
        return f"{filler} {token.text}"

    def informal_substitution(self, token):
        """Replace formal words with informal alternatives."""
        if token.text.lower() in self.informal_words:
            return random.choice(self.informal_words[token.text.lower()])
        return token.text

    def add_emphatic_repetition(self, token):
        """Add emphatic repetition for emphasis (adjectives/adverbs only)."""
        if token.pos_ in ["ADJ", "ADV"] and len(token.text) > 2:
            style = random.choice(["hyphen", "comma", "simple"])
            if style == "hyphen":
                return f"{token.text}-{token.text}"
            elif style == "comma":
                return f"{token.text}, {token.text}"
            else:
                return f"{token.text} {token.text}"
        return token.text

    # Paragraph-level transformations

    def add_discourse_markers(self, paragraph, profile):
        """Add discourse markers to enhance cohesion between sentences."""
        sentences = sent_tokenize(paragraph)
        if len(sentences) <= 1:
            return paragraph

        marker_types = list(self.discourse_markers.keys())
        weighted_types = random.choices(
            marker_types,
            weights=[0.2, 0.2, 0.2, 0.15, 0.1, 0.1, 0.05],
            k=min(len(sentences) - 1, 3),  # Don't add too many markers.
        )

        # Mark about half of the non-initial sentences.
        num_markers = min(len(sentences) - 1, max(1, int(len(sentences) * 0.5)))
        positions = sorted(random.sample(range(1, len(sentences)), num_markers))

        for i, pos in enumerate(positions):
            marker_type = weighted_types[i % len(weighted_types)]
            marker = random.choice(self.discourse_markers[marker_type])
            # Prepend the marker, lowering the sentence's original opener.
            sentences[pos] = f"{marker}, {sentences[pos][0].lower() + sentences[pos][1:]}"

        return " ".join(sentences)

    def adjust_formality(self, paragraph, profile):
        """Adjust the overall formality of the paragraph per the profile."""
        formality_level = profile.get('informal_rate', 0.5)

        # Low informality -> formalize.
        if formality_level < 0.3:
            for contraction, expansion in self.contractions.items():
                pattern = r'\b' + re.escape(contraction) + r'\b'
                paragraph = re.sub(pattern, expansion, paragraph, flags=re.IGNORECASE)
            informal_phrases = ["you know", "like", "kinda", "sort of", "pretty much"]
            for phrase in informal_phrases:
                paragraph = re.sub(r'\b' + re.escape(phrase) + r'\b', '', paragraph, flags=re.IGNORECASE)

        # High informality -> casualize.
        elif formality_level > 0.7:
            for formal, informals in self.informal_words.items():
                pattern = r'\b' + re.escape(formal) + r'\b'
                if random.random() < 0.7:
                    replacement = random.choice(informals)
                    paragraph = re.sub(pattern, replacement, paragraph, flags=re.IGNORECASE)
            for expansion, contraction in self.expansions.items():
                if ' ' in expansion:  # Only multi-word expansions.
                    pattern = r'\b' + re.escape(expansion) + r'\b'
                    paragraph = re.sub(pattern, contraction, paragraph, flags=re.IGNORECASE)

        return paragraph

    def add_cohesion_devices(self, paragraph, profile):
        """Add cohesion devices like pronouns and references."""
        sentences = sent_tokenize(paragraph)
        if len(sentences) <= 1:
            return paragraph

        doc = self.nlp(paragraph)

        # Collect key entities.
        entities = {}
        for ent in doc.ents:
            if ent.label_ in ["PERSON", "ORG", "GPE", "LOC", "PRODUCT"]:
                if ent.text not in entities:
                    entities[ent.text] = []
                entities[ent.text].append(ent.label_)

        # Collect key nouns.
        nouns = [token.text for token in doc if token.pos_ == "NOUN" and len(token.text) > 3]

        if not entities and not nouns:
            return paragraph

        # Prefer entities over plain nouns as the reference subject.
        reference_subject = None
        if entities:
            reference_subject = random.choice(list(entities.keys()))
        elif nouns:
            reference_subject = random.choice(nouns)
        if not reference_subject:
            return paragraph

        ref_type = random.choice(["pronoun", "determiner", "repetition"])

        # "repetition" intentionally leaves the text as-is.
        if ref_type == "pronoun":
            # Simple pronoun substitution (could be improved with gender recognition).
            pronouns = ["it", "they", "this", "these", "that", "those"]
            pronoun = random.choice(pronouns)
            for i, sent in enumerate(sentences):
                if reference_subject in sent and i < len(sentences) - 1:
                    next_sent = sentences[i + 1]
                    if reference_subject in next_sent:
                        sentences[i + 1] = next_sent.replace(reference_subject, pronoun, 1)
                    break
        elif ref_type == "determiner":
            determiners = ["this", "that", "these", "those", "the", "such a"]
            determiner = random.choice(determiners)
            for i, sent in enumerate(sentences):
                if reference_subject in sent and i < len(sentences) - 1:
                    sentences[i + 1] = sentences[i + 1].replace(
                        reference_subject,
                        f"{determiner} {reference_subject}",
                        1,
                    )
                    break

        return " ".join(sentences)


# Example usage
if __name__ == "__main__":
    humanizer = EnhancedTextHumanizer()

    original_text = """
    Artificial intelligence has significantly impacted numerous industries. It has improved efficiency in manufacturing through automation. The healthcare sector has benefited from better diagnostic tools. Machine learning algorithms continue to advance and provide new solutions. Companies invest heavily in AI research and development.
    """

    # Test different personality types.
    for personality in ['casual', 'formal', 'academic', 'enthusiastic']:
        print(f"\n--- {personality.upper()} PERSONALITY ---")
        humanized = humanizer.humanize_text(
            original_text,
            intensity=0.7,
            personality=personality,
        )
        print(humanized)

    # Test with regional dialect.
    print("\n--- REGIONAL DIALECT (US SOUTH) ---")
    humanized = humanizer.humanize_text(
        original_text,
        intensity=0.7,
        personality='casual',
        regional_dialect='us_south',
    )
    print(humanized)

    # Test with emotional tone.
    print("\n--- EMOTIONAL TONE (POSITIVE) ---")
    humanized = humanizer.humanize_text(
        original_text,
        intensity=0.7,
        personality='enthusiastic',
        emotional_tone='positive',
    )
    print(humanized)