# NOTE: The lines "Spaces:" / "Build error" above this file's code are artifacts
# of copying it out of a Hugging Face Spaces build log; they are not part of the
# program and have been converted to this comment so the file stays parseable.
| import nltk | |
| import random | |
| import re | |
| import spacy | |
| import numpy as np | |
| from nltk.corpus import wordnet | |
| from nltk.tokenize import sent_tokenize, word_tokenize | |
| from nltk.tag import pos_tag | |
| from collections import defaultdict | |
| from transformers import pipeline | |
# Fail fast if the spaCy English model is not installed (the class loads it
# again in __init__; this just surfaces the error at import time).
spacy.load('en_core_web_sm')

# Ensure every required NLTK resource is present, downloading each one only if
# it is missing.  The previous version never checked 'punkt_tab' (it was only
# downloaded as a side effect of some *other* resource being missing) and
# re-downloaded all packages on any single miss.
_NLTK_RESOURCES = {
    'tokenizers/punkt': 'punkt',
    'tokenizers/punkt_tab': 'punkt_tab',
    'taggers/averaged_perceptron_tagger': 'averaged_perceptron_tagger',
    'corpora/wordnet': 'wordnet',
}
for _resource_path, _package in _NLTK_RESOURCES.items():
    try:
        nltk.data.find(_resource_path)
    except LookupError as e:
        print(f"NLTK resource error: {e}")
        print("Attempting to download missing resources...")
        nltk.download(_package)
| class EnhancedTextHumanizer: | |
    def __init__(self):
        """Initialize NLP pipelines and the static wordbanks used by every transformation.

        Loads the spaCy English model and (best-effort) a transformers
        sentiment pipeline, then builds the transformation registries and the
        lookup tables (fillers, contractions, hedges, intensifiers, discourse
        markers, informal substitutions, dialects, personality profiles and
        emotional templates) that the transformation methods draw from.
        """
        # Load spaCy model
        self.nlp = spacy.load("en_core_web_sm")
        # Initialize sentiment analysis pipeline - for emotional variations
        try:
            self.sentiment_analyzer = pipeline("sentiment-analysis")
        except:
            print("Warning: Transformers sentiment analysis not available. Emotional variations will be limited.")
            self.sentiment_analyzer = None
        # Sentence-level transformations
        # NOTE: _apply_sentence_transformations builds its weight list in this
        # exact order - keep the two in sync.
        self.sentence_transformations = [
            self.merge_sentences,
            self.split_sentences,
            self.passive_to_active,
            self.active_to_passive,
            self.add_hedging,
            self.add_intensifiers,
            self.add_rhetorical_question,
            self.add_aside_comment
        ]
        # Word-level transformations
        # NOTE: _apply_word_transformations' weight list mirrors this order too.
        self.word_transformations = [
            self.contextual_synonym_replacement,
            self.contraction_expansion,
            self.add_filler_words,
            self.informal_substitution,
            self.add_emphatic_repetition
        ]
        # Paragraph-level transformations
        self.paragraph_transformations = [
            self.add_discourse_markers,
            self.adjust_formality,
            self.add_cohesion_devices
        ]
        # Filler words with categorization (category -> candidate fillers)
        self.filler_words = {
            'hesitation': ["um", "uh", "er", "hmm", "like"],
            'emphasis': ["actually", "literally", "seriously", "honestly", "truly", "really"],
            'hedging': ["maybe", "perhaps", "probably", "possibly", "somewhat", "kinda", "sort of"],
            'clarification': ["I mean", "you know", "what I'm saying", "in other words"],
            'informal': ["basically", "totally", "absolutely", "pretty much", "y'know"]
        }
        # Contractions and their expansions (contraction -> long form)
        self.contractions = {
            "can't": "cannot", "won't": "will not", "don't": "do not",
            "doesn't": "does not", "I'm": "I am", "you're": "you are",
            "they're": "they are", "we're": "we are", "it's": "it is",
            "that's": "that is", "who's": "who is", "what's": "what is",
            "there's": "there is", "here's": "here is", "he's": "he is",
            "she's": "she is", "I've": "I have", "you've": "you have",
            "we've": "we have", "they've": "they have", "I'll": "I will",
            "you'll": "you will", "he'll": "he will", "she'll": "she will",
            "we'll": "we will", "they'll": "they will", "I'd": "I would",
            "you'd": "you would", "he'd": "he would", "she'd": "she would",
            "we'd": "we would", "they'd": "they would", "let's": "let us",
            "ain't": "am not", "wasn't": "was not", "weren't": "were not",
            "hasn't": "has not", "haven't": "have not", "couldn't": "could not",
            "shouldn't": "should not", "wouldn't": "would not", "didn't": "did not",
            "isn't": "is not", "aren't": "are not", "mightn't": "might not",
            "mustn't": "must not", "shan't": "shall not", "needn't": "need not"
        }
        # Reverse contractions dictionary (long form -> contraction)
        self.expansions = {v: k for k, v in self.contractions.items()}
        # Common hedging phrases
        self.hedging_phrases = [
            "I think", "It seems", "It appears", "From what I understand",
            "As far as I know", "In my opinion", "Arguably", "Presumably",
            "It could be that", "It's possible that", "One might say",
            "To some extent", "More or less", "Kind of", "Sort of"
        ]
        # Intensifiers for emotional emphasis
        self.intensifiers = [
            "very", "extremely", "incredibly", "remarkably", "absolutely",
            "totally", "completely", "utterly", "entirely", "thoroughly",
            "ridiculously", "insanely", "super", "really", "quite",
            "unbelievably", "amazingly", "surprisingly", "exceptionally"
        ]
        # Discourse markers for transitions (7 categories; add_discourse_markers
        # hard-codes a weight list of the same length).
        self.discourse_markers = {
            'contrast': ["however", "but", "nevertheless", "on the other hand", "conversely", "in contrast", "yet"],
            'addition': ["moreover", "furthermore", "additionally", "also", "besides", "in addition", "plus"],
            'cause_effect': ["therefore", "consequently", "thus", "hence", "as a result", "so", "because of this"],
            'sequence': ["first", "second", "next", "then", "finally", "subsequently", "later"],
            'example': ["for example", "for instance", "specifically", "to illustrate", "such as", "namely"],
            'conclusion': ["in conclusion", "to sum up", "in summary", "overall", "ultimately", "in the end"],
            'emphasis': ["indeed", "certainly", "in fact", "obviously", "clearly", "notably", "significantly"]
        }
        # Informal substitutions (formal word -> casual alternatives)
        self.informal_words = {
            "approximately": ["about", "around"],
            "assistance": ["help", "a hand"],
            "attempt": ["try", "shot", "stab"],
            "communicate": ["talk", "chat", "get in touch"],
            "comprehend": ["get", "understand"],
            "concerning": ["about", "on"],
            "consume": ["eat", "drink", "use up"],
            "currently": ["now", "right now"],
            "decrease": ["drop", "cut", "fall"],
            "difficult": ["hard", "tough"],
            "encounter": ["meet", "run into", "bump into"],
            "endeavor": ["try", "take a shot"],
            "excessive": ["too much", "over the top"],
            "expedite": ["speed up", "hurry"],
            "facilitate": ["help", "make easier"],
            "frequently": ["often", "a lot"],
            "fundamental": ["basic", "key"],
            "utilize": ["use"],
            "purchase": ["buy", "get"],
            "sufficient": ["enough"],
            "inquire": ["ask"],
            "obtain": ["get"],
            "require": ["need"],
            "additional": ["more", "extra"],
            "residence": ["home", "place"]
        }
        # Common speech error patterns (name -> generator method)
        self.speech_errors = {
            'restarts': self._generate_restart,
            'repetitions': self._generate_repetition,
            'corrections': self._generate_correction,
            'filled_pauses': self._generate_filled_pause,
            'agreement_errors': self._generate_agreement_error
        }
        # Regional dialect variations (simplified; phrase -> dialect variant)
        self.regional_variations = {
            'us_south': {
                'you all': "y'all",
                'going to': "gonna",
                'want to': "wanna",
                'did not': "didn't",
                'yes': "yep",
                'no': "nope"
            },
            'british': {
                'apartment': "flat",
                'elevator': "lift",
                'trash': "rubbish",
                'sidewalk': "pavement",
                'vacation': "holiday",
                'soccer': "football"
            }
        }
        # Personality profiles (simplified; rates are probabilities in [0, 1])
        self.personality_profiles = {
            'casual': {
                'contraction_rate': 0.8,
                'informal_rate': 0.7,
                'hedging_rate': 0.3,
                'filler_rate': 0.4,
                'error_rate': 0.1
            },
            'formal': {
                'contraction_rate': 0.2,
                'informal_rate': 0.1,
                'hedging_rate': 0.5,
                'filler_rate': 0.1,
                'error_rate': 0.05
            },
            'academic': {
                'contraction_rate': 0.1,
                'informal_rate': 0.05,
                'hedging_rate': 0.6,
                'filler_rate': 0.1,
                'error_rate': 0.02
            },
            'enthusiastic': {
                'contraction_rate': 0.6,
                'informal_rate': 0.5,
                'hedging_rate': 0.2,
                'filler_rate': 0.3,
                'error_rate': 0.1,
                'intensifier_rate': 0.7
            }
        }
        # Emotional expression templates ({topic}/{sentence} filled via str.format)
        self.emotional_expressions = {
            'positive': [
                "I'm so happy about {topic}!",
                "This is amazing: {sentence}",
                "I love how {sentence}",
                "Wow, {sentence} That's incredible!",
                "I'm really excited about {topic}."
            ],
            'negative': [
                "I'm not too thrilled about {topic}.",
                "Unfortunately, {sentence}",
                "I'm concerned that {sentence}",
                "This is disappointing: {sentence}",
                "I'm a bit worried about {topic}."
            ],
            'neutral': [
                "In my view, {sentence}",
                "I think {sentence}",
                "From what I understand, {sentence}",
                "My take on {topic} is that {sentence}",
                "When it comes to {topic}, {sentence}"
            ]
        }
| def humanize_text(self, text, intensity=0.5, personality='casual', add_errors=True, regional_dialect=None, emotional_tone=None): | |
| """ | |
| Enhanced main function to humanize text with multiple parameters for customization. | |
| Args: | |
| text (str): The input text to humanize | |
| intensity (float): Controls how much the text is transformed (0.0 to 1.0) | |
| personality (str): Personality profile to use ('casual', 'formal', 'academic', 'enthusiastic') | |
| add_errors (bool): Whether to add realistic speech/typing errors | |
| regional_dialect (str): Regional dialect to incorporate (None, 'us_south', 'british') | |
| emotional_tone (str): Overall emotional tone (None, 'positive', 'negative', 'neutral') | |
| Returns: | |
| str: Humanized text | |
| """ | |
| if intensity < 0 or intensity > 1: | |
| raise ValueError("Intensity must be between 0.0 and 1.0") | |
| # Apply personality profile | |
| profile = self.personality_profiles.get(personality, self.personality_profiles['casual']) | |
| # Parse the text with spaCy for better linguistic analysis | |
| doc = self.nlp(text) | |
| # Split text into paragraphs | |
| paragraphs = [p.strip() for p in text.split('\n') if p.strip()] | |
| # Apply paragraph-level transformations | |
| transformed_paragraphs = [] | |
| for para in paragraphs: | |
| # Analyze sentiment if available | |
| sentiment = self._analyze_sentiment(para) if emotional_tone is None and self.sentiment_analyzer else None | |
| current_tone = emotional_tone or (sentiment['label'].lower() if sentiment else 'neutral') | |
| # Add emotional expressions based on tone | |
| if random.random() < intensity * 0.3 and current_tone in self.emotional_expressions: | |
| # Find a topic in the paragraph | |
| topic = self._extract_topic(para) | |
| expression = random.choice(self.emotional_expressions[current_tone]) | |
| para = expression.format(topic=topic, sentence=para.lower() if para[0].isupper() else para) | |
| # Apply paragraph transformations | |
| for transform in self.paragraph_transformations: | |
| if random.random() < intensity * 0.4: | |
| para = transform(para, profile) | |
| # Split paragraph into sentences | |
| sentences = sent_tokenize(para) | |
| # Apply sentence-level transformations | |
| transformed_sentences = self._apply_sentence_transformations(sentences, intensity, profile) | |
| # Apply regional dialect if specified | |
| if regional_dialect and regional_dialect in self.regional_variations: | |
| transformed_para = " ".join(transformed_sentences) | |
| for original, variant in self.regional_variations[regional_dialect].items(): | |
| # Use word boundaries to avoid partial replacements | |
| pattern = r'\b' + re.escape(original) + r'\b' | |
| if random.random() < intensity * 0.7: | |
| transformed_para = re.sub(pattern, variant, transformed_para, flags=re.IGNORECASE) | |
| transformed_paragraphs.append(transformed_para) | |
| else: | |
| transformed_paragraphs.append(" ".join(transformed_sentences)) | |
| # Join paragraphs | |
| result = "\n\n".join(transformed_paragraphs) | |
| # Apply word-level transformations | |
| result = self._apply_word_transformations(result, intensity, profile) | |
| # Introduce speech errors if enabled | |
| if add_errors and intensity > 0.2: | |
| result = self._introduce_speech_errors(result, intensity * profile.get('error_rate', 0.1)) | |
| # Normalize spacing around punctuation | |
| result = self._normalize_spacing(result) | |
| return result | |
| def _analyze_sentiment(self, text): | |
| """Analyze sentiment of the text using the sentiment analyzer.""" | |
| if self.sentiment_analyzer: | |
| try: | |
| return self.sentiment_analyzer(text)[0] | |
| except: | |
| pass | |
| return None | |
| def _extract_topic(self, text): | |
| """Extract a potential topic from the text using spaCy.""" | |
| doc = self.nlp(text) | |
| # Try to find entities | |
| entities = list(doc.ents) | |
| if entities: | |
| return entities[0].text | |
| # Try to find noun chunks | |
| chunks = list(doc.noun_chunks) | |
| if chunks: | |
| return chunks[0].text | |
| # Fallback to first sentence | |
| sentences = sent_tokenize(text) | |
| if sentences: | |
| words = word_tokenize(sentences[0]) | |
| if words: | |
| return words[0] | |
| return "this" | |
    def _apply_sentence_transformations(self, sentences, intensity, profile):
        """Apply various sentence-level transformations with personality profile influence.

        Walks the sentence list; with probability ``intensity * 0.7`` per
        sentence one transformation is sampled from
        ``self.sentence_transformations`` using personality-weighted sampling.
        ``merge_sentences`` consumes two consecutive sentences; every other
        transformation consumes one.

        Args:
            sentences (list[str]): Sentences of one paragraph, in order.
            intensity (float): Global transformation strength in [0, 1].
            profile (dict): Personality profile with per-feature rates.

        Returns:
            list[str]: Transformed sentences (may be shorter than the input
            when sentences were merged).
        """
        result = []
        i = 0
        while i < len(sentences):
            # Randomly decide whether to apply a transformation
            if random.random() < intensity * 0.7:
                # Weight transformations based on personality.
                # NOTE: order must mirror self.sentence_transformations exactly.
                weights = [
                    1.0,  # merge_sentences
                    0.8,  # split_sentences
                    0.5 if profile.get('hedging_rate', 0.3) > 0.4 else 0.2,  # passive_to_active
                    0.2 if profile.get('hedging_rate', 0.3) > 0.4 else 0.5,  # active_to_passive
                    profile.get('hedging_rate', 0.3),  # add_hedging
                    profile.get('intensifier_rate', 0.4),  # add_intensifiers
                    0.3 if profile.get('informal_rate', 0.5) > 0.5 else 0.1,  # add_rhetorical_question
                    0.4 if profile.get('informal_rate', 0.5) > 0.4 else 0.2,  # add_aside_comment
                ]
                # Normalize weights.  (random.choices accepts raw weights, so
                # this rescaling does not change the distribution.)
                weights = [w / sum(weights) for w in weights]
                # Choose a transformation based on weights
                transformation = random.choices(self.sentence_transformations, weights=weights)[0]
                # For transformations requiring two sentences
                if transformation in [self.merge_sentences] and i < len(sentences) - 1:
                    transformed = transformation(sentences[i], sentences[i+1])
                    result.append(transformed)
                    i += 2
                # For transformations requiring one sentence (the second
                # argument is an ignored placeholder in those methods)
                else:
                    transformed = transformation(sentences[i], "")
                    result.append(transformed)
                    i += 1
            else:
                result.append(sentences[i])
                i += 1
        return result
    def _apply_word_transformations(self, text, intensity, profile):
        """Apply various word-level transformations with personality profile influence.

        Re-parses the text sentence by sentence and, with probability
        ``intensity * 0.5`` per non-punctuation token, applies one weighted
        transformation from ``self.word_transformations``.

        Args:
            text (str): Text to transform (may contain several sentences).
            intensity (float): Global transformation strength in [0, 1].
            profile (dict): Personality profile with per-feature rates.

        Returns:
            str: The transformed text, sentences re-joined with single spaces.
        """
        # Parse the text with spaCy for better context
        doc = self.nlp(text)
        # Apply transformations separately to preserve sentence structure
        sentences = [sent.text for sent in doc.sents]
        transformed_sentences = []
        for sentence in sentences:
            # Tokenize sentence (second parse: token indices below refer to it)
            sentence_doc = self.nlp(sentence)
            # Build a new sentence from tokens
            new_tokens = []
            i = 0
            while i < len(sentence_doc):
                token = sentence_doc[i]
                # Skip punctuation for most transformations
                if token.is_punct:
                    new_tokens.append(token.text)
                    i += 1
                    continue
                # Randomly decide whether to apply a transformation
                if random.random() < intensity * 0.5:
                    # Weight transformations based on personality.
                    # NOTE: order must mirror self.word_transformations exactly.
                    weights = [
                        0.6,  # contextual_synonym_replacement
                        profile.get('contraction_rate', 0.5),  # contraction_expansion
                        profile.get('filler_rate', 0.3),  # add_filler_words
                        profile.get('informal_rate', 0.5),  # informal_substitution
                        0.3 if profile.get('intensifier_rate', 0.4) > 0.5 else 0.1,  # add_emphatic_repetition
                    ]
                    # Normalize weights (rescaling only; distribution unchanged)
                    weights = [w / sum(weights) for w in weights]
                    # Choose a transformation based on weights
                    transformation = random.choices(self.word_transformations, weights=weights)[0]
                    # Apply transformation
                    if transformation == self.contextual_synonym_replacement:
                        transformed = transformation(token, sentence_doc)
                    elif transformation == self.contraction_expansion:
                        # Need to check if this is a multi-word expansion
                        if token.text.lower() in self.expansions:
                            # This is a potential expansion point; the mapped
                            # value is the *contraction* for this long form.
                            expansion = self.expansions[token.text.lower()]
                            if ' ' in expansion and i < len(sentence_doc) - 1:
                                # Check if the next tokens match the expansion
                                expansion_parts = expansion.split()
                                if expansion_parts[0].lower() == token.text.lower() and expansion_parts[1].lower() == sentence_doc[i+1].text.lower():
                                    # Apply contraction
                                    transformed = expansion
                                    i += 1  # Skip the next token
                                else:
                                    transformed = token.text
                            else:
                                # NOTE(review): for a single-word long form
                                # (e.g. "cannot") this returns the token
                                # unchanged instead of its contraction, so
                                # that contraction never fires via this
                                # branch - looks unintended; confirm.
                                transformed = token.text
                        else:
                            transformed = transformation(token)
                    elif transformation == self.add_filler_words:
                        # Add a filler word before the current word
                        if random.random() < 0.3:  # Only occasionally add fillers
                            filler_category = random.choice(list(self.filler_words.keys()))
                            filler = random.choice(self.filler_words[filler_category])
                            new_tokens.append(filler)
                        transformed = token.text
                    elif transformation == self.informal_substitution:
                        transformed = transformation(token)
                    elif transformation == self.add_emphatic_repetition:
                        transformed = transformation(token)
                    else:
                        transformed = token.text
                    new_tokens.append(transformed)
                else:
                    new_tokens.append(token.text)
                i += 1
            # Recreate the sentence from tokens
            transformed_sentence = self._reconstruct_sentence(new_tokens)
            transformed_sentences.append(transformed_sentence)
        # Join the transformed sentences
        result = " ".join(transformed_sentences)
        return result
| def _reconstruct_sentence(self, tokens): | |
| """Reconstruct a sentence from tokens, preserving proper spacing.""" | |
| result = "" | |
| for i, token in enumerate(tokens): | |
| # Handle special cases for punctuation | |
| if token in ".,!?;:)]}" and result: | |
| result = result.rstrip() + token + " " | |
| # Don't add space after opening brackets | |
| elif i > 0 and tokens[i-1] in "([{" and result: | |
| result = result.rstrip() + token + " " | |
| elif token in "([{" and result: | |
| result = result.rstrip() + token | |
| # Handle quotes | |
| elif token in ['\'', '"'] and result and result[-1] != " ": | |
| result += token + " " | |
| else: | |
| result += token + " " | |
| return result.strip() | |
| def _introduce_speech_errors(self, text, error_rate): | |
| """Introduce realistic speech/typing errors.""" | |
| words = text.split() | |
| result = [] | |
| for i, word in enumerate(words): | |
| if len(word) > 2 and random.random() < error_rate: | |
| # Select a random error type | |
| error_type = random.choice(list(self.speech_errors.keys())) | |
| # Apply the error | |
| error_func = self.speech_errors[error_type] | |
| if error_type in ['restarts', 'repetitions'] and i > 0: | |
| # These errors need previous context | |
| modified = error_func(words[i-1], word) | |
| result.pop() # Remove the previous word | |
| result.append(modified) | |
| else: | |
| result.append(error_func(word)) | |
| else: | |
| result.append(word) | |
| return " ".join(result) | |
| def _normalize_spacing(self, text): | |
| """Fix spacing around punctuation for a more natural look.""" | |
| # Fix spacing around punctuation | |
| text = re.sub(r'\s+([.,;:!?)])', r'\1', text) | |
| text = re.sub(r'([(])\s+', r'\1', text) | |
| # Fix multiple spaces | |
| text = re.sub(r'\s{2,}', ' ', text) | |
| return text | |
| # Speech error generators | |
| def _generate_restart(self, prev_word, word): | |
| """Generate a restart error (e.g., "I was- I was saying").""" | |
| return f"{prev_word}- {prev_word} {word}" | |
| def _generate_repetition(self, prev_word, word): | |
| """Generate a word repetition (e.g., "the the cat").""" | |
| return f"{prev_word} {prev_word} {word}" | |
| def _generate_correction(self, word): | |
| """Generate a self-correction (e.g., "teh the").""" | |
| if len(word) < 4: | |
| return word | |
| # Create a simple typo | |
| i = random.randint(0, len(word) - 2) | |
| typo = word[:i] + word[i+1] + word[i] + word[i+2:] | |
| # Choose correction style | |
| correction_style = random.choice(["asterisk", "dash", "explicit"]) | |
| if correction_style == "asterisk": | |
| return f"{typo}*{word}*" | |
| elif correction_style == "dash": | |
| return f"{typo}-{word}" | |
| else: | |
| return f"{typo}, I mean {word}" | |
| def _generate_filled_pause(self, word): | |
| """Generate a filled pause (e.g., "um, like").""" | |
| filler = random.choice(self.filler_words['hesitation']) | |
| return f"{filler}, {word}" | |
| def _generate_agreement_error(self, word): | |
| """Generate a subject-verb agreement error (simplified).""" | |
| if word.endswith('s') and len(word) > 3: | |
| return word[:-1] | |
| elif not word.endswith('s') and random.random() < 0.5: | |
| return word + 's' | |
| return word | |
| # Enhanced sentence-level transformations | |
| def merge_sentences(self, sent1, sent2): | |
| """Merge two sentences with a conjunction.""" | |
| # Remove the period from the first sentence | |
| if sent1.endswith('.'): | |
| sent1 = sent1[:-1] | |
| # Choose a conjunction based on the content | |
| doc1 = self.nlp(sent1) | |
| doc2 = self.nlp(sent2) | |
| # Check for content relationship | |
| similarity = doc1.similarity(doc2) | |
| if similarity > 0.7: | |
| # Highly similar, use addition | |
| conjunction = random.choice(["and", "also", "moreover", "furthermore"]) | |
| elif similarity < 0.3: | |
| # Dissimilar, use contrast | |
| conjunction = random.choice(["but", "however", "on the other hand", "yet"]) | |
| else: | |
| # Moderate similarity, use general conjunction | |
| conjunction = random.choice(["and", "while", "so", "because", "although"]) | |
| # Merge the sentences | |
| return f"{sent1} {conjunction} {sent2.lower() if sent2 and sent2[0].isupper() else sent2}" | |
| def split_sentences(self, sent, _): | |
| """Split a longer sentence into two with improved linguistic awareness.""" | |
| doc = self.nlp(sent) | |
| tokens = [token for token in doc] | |
| # Only split if sentence is long enough | |
| if len(tokens) < 8: | |
| return sent | |
| # Find a good split point based on dependency structure | |
| potential_splits = [] | |
| for i, token in enumerate(tokens): | |
| # Good split points are often after conjunctions or punctuation | |
| if (token.dep_ in ['cc', 'prep', 'mark'] or token.pos_ == 'PUNCT') and 3 < i < len(tokens) - 3: | |
| potential_splits.append((i, 1)) # Higher weight for these | |
| # Or before a new clause | |
| elif token.dep_ in ['nsubj', 'nsubjpass'] and i > 3: | |
| potential_splits.append((i, 0.8)) | |
| if not potential_splits: | |
| # Fallback to middle | |
| split_point = len(tokens) // 2 | |
| else: | |
| # Choose a split point with weighted random selection | |
| points, weights = zip(*potential_splits) | |
| split_point = random.choices(points, weights=weights)[0] | |
| # Create two new sentences | |
| sent1 = "".join([t.text_with_ws for t in tokens[:split_point]]) | |
| sent2 = "".join([t.text_with_ws for t in tokens[split_point:]]) | |
| # Ensure proper capitalization and punctuation | |
| sent1 = sent1.rstrip() | |
| if not sent1.endswith(('.', '!', '?')): | |
| sent1 += '.' | |
| sent2 = sent2.strip() | |
| if sent2 and sent2[0].islower(): | |
| sent2 = sent2[0].upper() + sent2[1:] | |
| return f"{sent1} {sent2}" | |
    def passive_to_active(self, sent, _):
        """Convert passive voice to active voice using spaCy's dependency parsing.

        Scans for a passive subject (``nsubjpass``), looks for a "by"-agent
        attached to the same verb, and if both are found rebuilds the sentence
        as "<agent> <verb> <subject>".  Returns the sentence unchanged when no
        convertible passive construction is found.

        Args:
            sent (str): Sentence to convert.
            _ : Unused placeholder so all sentence transformations share a
                two-argument signature.

        Returns:
            str: The converted sentence, or the original one.
        """
        doc = self.nlp(sent)
        # Look for passive constructions
        for token in doc:
            if token.dep_ == "nsubjpass":
                # Found passive voice
                subject = token
                agent = None
                verb = token.head
                # Find the agent (often introduced by "by").  The whole doc is
                # scanned rather than verb.children; the head check keeps only
                # agents of this verb.
                for child in doc:
                    if child.dep_ == "agent" and child.head == verb:
                        for grandchild in child.children:
                            if grandchild.dep_ in ["pobj", "nmod"]:
                                agent = grandchild
                                break
                if agent:
                    # Extract the core components (full subtrees for the noun
                    # phrases, the bare verb token for the verb).
                    subj_span = doc[subject.left_edge.i:subject.right_edge.i+1].text
                    verb_span = doc[verb.i:verb.i+1].text
                    agent_span = doc[agent.left_edge.i:agent.right_edge.i+1].text
                    # Reconstruct in active voice.
                    # NOTE(review): verb_span is a single token, so the
                    # "was "/"were " replacements below look like no-ops -
                    # the auxiliary is a separate token; confirm intent.
                    active_verb = verb_span.replace("was ", "").replace("were ", "")
                    # Preserve the trailing period, if any.
                    if sent.endswith('.'):
                        new_sent = f"{agent_span} {active_verb} {subj_span}."
                    else:
                        new_sent = f"{agent_span} {active_verb} {subj_span}"
                    return new_sent
        # If no passive construction found or couldn't convert
        return sent
| def active_to_passive(self, sent, _): | |
| """Convert active voice to passive voice using spaCy's dependency parsing.""" | |
| doc = self.nlp(sent) | |
| # Look for active voice constructions | |
| for token in doc: | |
| if token.dep_ == "nsubj" and token.head.pos_ == "VERB": | |
| # Found a subject and verb | |
| subject = token | |
| verb = token.head | |
| # Find the direct object | |
| obj = None | |
| for child in verb.children: | |
| if child.dep_ in ["dobj", "obj"]: | |
| obj = child | |
| break | |
| if obj: | |
| # Extract the core components | |
| subj_span = doc[subject.left_edge.i:subject.right_edge.i+1].text | |
| verb_span = doc[verb.i:verb.i+1].text | |
| obj_span = doc[obj.left_edge.i:obj.right_edge.i+1].text | |
| # Determine the passive verb form | |
| passive_verb = verb_span | |
| if verb_span.endswith("s"): | |
| passive_verb = passive_verb[:-1] | |
| # Reconstruct in passive voice | |
| # Remove trailing period for reconstruction | |
| if sent.endswith('.'): | |
| new_sent = f"{obj_span} was {passive_verb}ed by {subj_span}." | |
| else: | |
| new_sent = f"{obj_span} was {passive_verb}ed by {subj_span}" | |
| return new_sent | |
| # If no active construction found or couldn't convert | |
| return sent | |
| def add_hedging(self, sent, _): | |
| """Add hedging language to a statement.""" | |
| # Add a hedging phrase at the beginning of the sentence | |
| hedging = random.choice(self.hedging_phrases) | |
| # For questions, add hedging at the end | |
| if sent.endswith('?'): | |
| return f"{sent[:-1]}, {hedging.lower()}?" | |
| # For statements, add at the beginning | |
| if sent[0].isupper(): | |
| return f"{hedging}, {sent[0].lower() + sent[1:]}" | |
| return f"{hedging}, {sent}" | |
| def add_intensifiers(self, sent, _): | |
| """Add intensifiers to adjectives and adverbs.""" | |
| doc = self.nlp(sent) | |
| words = list(doc) | |
| result = [] | |
| for i, token in enumerate(words): | |
| # Add intensifier before adjectives and adverbs | |
| if token.pos_ in ["ADJ", "ADV"] and random.random() < 0.6: | |
| # Choose an appropriate intensifier | |
| intensifier = random.choice(self.intensifiers) | |
| # Add the intensifier before the adjective/adverb | |
| result.append(intensifier) | |
| # Add the current token | |
| result.append(token.text) | |
| return " ".join(result) | |
| def add_rhetorical_question(self, sent, _): | |
| """Add a rhetorical question related to the statement.""" | |
| # Create a rhetorical question based on the content | |
| doc = self.nlp(sent) | |
| # Extract key information | |
| subjects = [tok for tok in doc if tok.dep_ in ["nsubj", "nsubjpass"]] | |
| if subjects and random.random() < 0.7: | |
| subject = subjects[0].text | |
| # Various question templates | |
| templates = [ | |
| f"Isn't that interesting about {subject}?", | |
| f"Don't you think so?", | |
| f"Right?", | |
| f"You know what I mean?", | |
| f"Can you imagine?", | |
| f"Who would have thought?", | |
| f"Why is that so important?" | |
| ] | |
| return f"{sent} {random.choice(templates)}" | |
| return sent | |
| def add_aside_comment(self, sent, _): | |
| """Add a parenthetical aside or comment.""" | |
| # Inject an aside comment in the middle or end of the sentence | |
| doc = self.nlp(sent) | |
| words = [token.text for token in doc] | |
| # Choose position for the aside | |
| if len(words) > 5: | |
| position = random.randint(3, len(words) - 2) if len(words) > 5 else len(words) | |
| else: | |
| # If sentence is too short, add at the end | |
| position = len(words) | |
| # Create aside comments | |
| asides = [ | |
| "by the way", | |
| "if you ask me", | |
| "I think", | |
| "you know", | |
| "to be honest", | |
| "believe it or not", | |
| "interestingly", | |
| "surprisingly", | |
| "and this is important" | |
| ] | |
| aside = random.choice(asides) | |
| # Insert the aside | |
| if position < len(words): | |
| # Insert in the middle, with commas | |
| words.insert(position, f", {aside},") | |
| else: | |
| # Add at the end | |
| if sent.endswith('.'): | |
| words[-1] = words[-1][:-1] # Remove the period | |
| words.append(f", {aside}.") | |
| else: | |
| words.append(f", {aside}") | |
| return " ".join(words) | |
| # Word-level transformations | |
| def contextual_synonym_replacement(self, token, doc): | |
| """Replace a word with a contextually appropriate synonym.""" | |
| # Only replace content words | |
| if token.pos_ not in ["NOUN", "VERB", "ADJ", "ADV"] or token.is_stop: | |
| return token.text | |
| # Find synonyms using WordNet | |
| synonyms = [] | |
| for syn in wordnet.synsets(token.text): | |
| for lemma in syn.lemmas(): | |
| synonym = lemma.name().replace('_', ' ') | |
| if synonym != token.text and synonym not in synonyms: | |
| synonyms.append(synonym) | |
| # If no synonyms found, return original | |
| if not synonyms: | |
| return token.text | |
| # Filter synonyms that fit the context | |
| filtered_synonyms = [] | |
| for synonym in synonyms[:5]: # Limit checking to 5 synonyms for efficiency | |
| # Create a new document with the synonym | |
| new_text = doc.text.replace(token.text, synonym) | |
| new_doc = self.nlp(new_text) | |
| # Calculate similarity between original and modified text | |
| similarity = doc.similarity(new_doc) | |
| if similarity > 0.8: # High semantic similarity threshold | |
| filtered_synonyms.append((synonym, similarity)) | |
| # If no good contextual synonyms, return original | |
| if not filtered_synonyms: | |
| return token.text | |
| # Sort by similarity and choose one of the top options | |
| filtered_synonyms.sort(key=lambda x: x[1], reverse=True) | |
| return random.choice(filtered_synonyms[:3])[0] | |
| def contraction_expansion(self, token): | |
| """Toggle between contractions and their expansions.""" | |
| if token.text.lower() in self.contractions: | |
| # Expand a contraction | |
| return self.contractions[token.text.lower()] | |
| elif token.text.lower() in self.expansions: | |
| # Contract an expansion | |
| return self.expansions[token.text.lower()] | |
| return token.text | |
| def add_filler_words(self, token): | |
| """Add filler words appropriate to the context.""" | |
| # Determine appropriate filler category based on token properties | |
| filler_category = None | |
| if token.pos_ == "ADJ": | |
| filler_category = random.choice(["emphasis", "hedging"]) | |
| elif token.pos_ == "VERB": | |
| filler_category = random.choice(["hesitation", "emphasis"]) | |
| elif token.pos_ == "NOUN": | |
| filler_category = random.choice(["clarification", "informal"]) | |
| else: | |
| filler_category = random.choice(list(self.filler_words.keys())) | |
| filler = random.choice(self.filler_words[filler_category]) | |
| # Add the filler before the token | |
| return f"{filler} {token.text}" | |
| def informal_substitution(self, token): | |
| """Replace formal words with informal alternatives.""" | |
| if token.text.lower() in self.informal_words: | |
| return random.choice(self.informal_words[token.text.lower()]) | |
| return token.text | |
| def add_emphatic_repetition(self, token): | |
| """Add emphatic repetition for emphasis.""" | |
| # Only repeat certain word types | |
| if token.pos_ in ["ADJ", "ADV"] and len(token.text) > 2: | |
| # Choose repetition style | |
| style = random.choice(["hyphen", "comma", "simple"]) | |
| if style == "hyphen": | |
| return f"{token.text}-{token.text}" | |
| elif style == "comma": | |
| return f"{token.text}, {token.text}" | |
| else: | |
| return f"{token.text} {token.text}" | |
| return token.text | |
| # Paragraph-level transformations | |
| def add_discourse_markers(self, paragraph, profile): | |
| """Add discourse markers to enhance cohesion.""" | |
| sentences = sent_tokenize(paragraph) | |
| if len(sentences) <= 1: | |
| return paragraph | |
| # Determine appropriate markers based on content | |
| marker_types = list(self.discourse_markers.keys()) | |
| weighted_types = random.choices( | |
| marker_types, | |
| weights=[0.2, 0.2, 0.2, 0.15, 0.1, 0.1, 0.05], | |
| k=min(len(sentences)-1, 3) # Don't add too many markers | |
| ) | |
| # Add markers to random sentences | |
| num_markers = min(len(sentences) - 1, max(1, int(len(sentences) * 0.5))) | |
| positions = sorted(random.sample(range(1, len(sentences)), num_markers)) | |
| for i, pos in enumerate(positions): | |
| marker_type = weighted_types[i % len(weighted_types)] | |
| marker = random.choice(self.discourse_markers[marker_type]) | |
| # Add the marker at the beginning of the sentence | |
| sentences[pos] = f"{marker}, {sentences[pos][0].lower() + sentences[pos][1:]}" | |
| return " ".join(sentences) | |
| def adjust_formality(self, paragraph, profile): | |
| """Adjust the overall formality of the paragraph.""" | |
| formality_level = profile.get('informal_rate', 0.5) | |
| # For formal text (low informality) | |
| if formality_level < 0.3: | |
| # Replace contractions with expansions | |
| for contraction, expansion in self.contractions.items(): | |
| pattern = r'\b' + re.escape(contraction) + r'\b' | |
| paragraph = re.sub(pattern, expansion, paragraph, flags=re.IGNORECASE) | |
| # Remove certain informal phrases | |
| informal_phrases = ["you know", "like", "kinda", "sort of", "pretty much"] | |
| for phrase in informal_phrases: | |
| paragraph = re.sub(r'\b' + re.escape(phrase) + r'\b', '', paragraph, flags=re.IGNORECASE) | |
| # For informal text (high informality) | |
| elif formality_level > 0.7: | |
| # Replace formal words with informal alternatives | |
| for formal, informals in self.informal_words.items(): | |
| pattern = r'\b' + re.escape(formal) + r'\b' | |
| if random.random() < 0.7: | |
| replacement = random.choice(informals) | |
| paragraph = re.sub(pattern, replacement, paragraph, flags=re.IGNORECASE) | |
| # Add contractions | |
| for expansion, contraction in self.expansions.items(): | |
| if ' ' in expansion: # Only multi-word expansions | |
| pattern = r'\b' + re.escape(expansion) + r'\b' | |
| paragraph = re.sub(pattern, contraction, paragraph, flags=re.IGNORECASE) | |
| return paragraph | |
    def add_cohesion_devices(self, paragraph, profile):
        """Add a cohesion device (pronoun or determiner reference) to the paragraph.

        Picks a salient named entity (preferred) or long noun, then rewrites
        that subject's first occurrence in the sentence FOLLOWING its first
        mention, so consecutive sentences read as linked.

        Args:
            paragraph: The paragraph text to transform.
            profile: Style profile dict (unused here; kept so all
                paragraph-level transformations share one signature).

        Returns:
            The paragraph, possibly with one reference rewritten; returned
            unchanged for single-sentence input or when nothing referable
            is found.
        """
        sentences = sent_tokenize(paragraph)
        if len(sentences) <= 1:
            return paragraph
        # Parse the paragraph
        doc = self.nlp(paragraph)
        # Extract named entities of the kinds that make natural referents.
        entities = {}
        for ent in doc.ents:
            if ent.label_ in ["PERSON", "ORG", "GPE", "LOC", "PRODUCT"]:
                if ent.text not in entities:
                    entities[ent.text] = []
                entities[ent.text].append(ent.label_)
        # Extract key nouns (longer than 3 chars, skipping trivial words).
        nouns = [token.text for token in doc if token.pos_ == "NOUN" and len(token.text) > 3]
        # If no entities or nouns found, return original
        if not entities and not nouns:
            return paragraph
        # Choose an entity or noun to reference; entities take priority.
        reference_subject = None
        if entities:
            reference_subject = random.choice(list(entities.keys()))
        elif nouns:
            reference_subject = random.choice(nouns)
        if not reference_subject:
            return paragraph
        # Choose a reference type. "repetition" deliberately leaves the text
        # unchanged: the subject is already repeated verbatim.
        ref_type = random.choice(["pronoun", "determiner", "repetition"])
        # Apply the reference in a later sentence
        if ref_type == "pronoun":
            # Simple pronoun substitution (could be improved with gender recognition)
            pronouns = ["it", "they", "this", "these", "that", "those"]
            pronoun = random.choice(pronouns)
            # Find a sentence with the reference subject
            for i, sent in enumerate(sentences):
                if reference_subject in sent and i < len(sentences) - 1:
                    # Replace in the next sentence if possible; only the
                    # first occurrence there is substituted.
                    next_sent = sentences[i+1]
                    if reference_subject in next_sent:
                        sentences[i+1] = next_sent.replace(reference_subject, pronoun, 1)
                    break
        elif ref_type == "determiner":
            # Add a determiner phrase
            determiners = ["this", "that", "these", "those", "the", "such a"]
            determiner = random.choice(determiners)
            # Find a sentence with the reference subject
            for i, sent in enumerate(sentences):
                if reference_subject in sent and i < len(sentences) - 1:
                    # Add in the next sentence if possible (no-op when the
                    # subject does not recur there).
                    sentences[i+1] = sentences[i+1].replace(
                        reference_subject,
                        f"{determiner} {reference_subject}",
                        1
                    )
                    break
        return " ".join(sentences)
# Example usage
def main():
    """Demonstrate EnhancedTextHumanizer across personalities, dialect, and tone."""
    # Initialize the humanizer (loads the spaCy model and pipelines).
    humanizer = EnhancedTextHumanizer()
    # Sample input reused by every demonstration below.
    original_text = """
    Artificial intelligence has significantly impacted numerous industries.
    It has improved efficiency in manufacturing through automation.
    The healthcare sector has benefited from better diagnostic tools.
    Machine learning algorithms continue to advance and provide new solutions.
    Companies invest heavily in AI research and development.
    """
    # Each personality reshapes the same text with a different voice.
    for personality in ['casual', 'formal', 'academic', 'enthusiastic']:
        print(f"\n--- {personality.upper()} PERSONALITY ---")
        humanized = humanizer.humanize_text(
            original_text,
            intensity=0.7,
            personality=personality
        )
        print(humanized)
    # Regional dialect flavoring layered on the casual personality.
    print("\n--- REGIONAL DIALECT (US SOUTH) ---")
    humanized = humanizer.humanize_text(
        original_text,
        intensity=0.7,
        personality='casual',
        regional_dialect='us_south'
    )
    print(humanized)
    # Emotional tone steering with the enthusiastic personality.
    print("\n--- EMOTIONAL TONE (POSITIVE) ---")
    humanized = humanizer.humanize_text(
        original_text,
        intensity=0.7,
        personality='enthusiastic',
        emotional_tone='positive'
    )
    print(humanized)


if __name__ == "__main__":
    main()