Spaces:
Sleeping
Sleeping
| import re | |
| import random | |
| import nltk | |
| from typing import List, Dict, Optional | |
| import numpy as np | |
# Download required NLTK data.
# Each entry pairs the path probed with nltk.data.find() with the package name
# handed to nltk.download() when that probe fails. 'punkt_tab' is listed next to
# 'punkt' because recent NLTK releases load the sentence/word tokenizer models
# from 'punkt_tab'; on older releases the extra download attempt is harmless.
for _resource_path, _package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/wordnet', 'wordnet'),
    ('corpora/omw-1.4', 'omw-1.4'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)
| from nltk.tokenize import sent_tokenize, word_tokenize | |
| from nltk.corpus import wordnet | |
| # Try to import optional dependencies with fallbacks | |
| try: | |
| from sentence_transformers import SentenceTransformer | |
| SENTENCE_TRANSFORMERS_AVAILABLE = True | |
| except ImportError as e: | |
| print(f"⚠️ Warning: sentence_transformers not available: {e}") | |
| print("💡 Falling back to basic similarity calculation") | |
| SENTENCE_TRANSFORMERS_AVAILABLE = False | |
| try: | |
| from transformers import pipeline | |
| TRANSFORMERS_AVAILABLE = True | |
| except ImportError as e: | |
| print(f"⚠️ Warning: transformers not available: {e}") | |
| print("💡 Paraphrasing will be disabled") | |
| TRANSFORMERS_AVAILABLE = False | |
| try: | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity as sklearn_cosine_similarity | |
| SKLEARN_AVAILABLE = True | |
| except ImportError as e: | |
| print(f"⚠️ Warning: scikit-learn not available: {e}") | |
| print("💡 Using basic similarity calculation") | |
| SKLEARN_AVAILABLE = False | |
class AITextHumanizer:
    """Rewrite formal, machine-sounding text so it reads more casually.

    The humanizer chains several independent, mostly rule-based passes
    (transition replacement, formal-word replacement, contractions, sentence
    variation, small imperfections and optional model-based paraphrasing) and
    then scores how close the result stays to the input.

    Optional dependencies are honoured through the module-level flags
    SENTENCE_TRANSFORMERS_AVAILABLE, TRANSFORMERS_AVAILABLE and
    SKLEARN_AVAILABLE; when one is missing the matching feature is skipped or
    replaced by a simpler fallback.
    """

    # A humanized result scoring below this similarity is discarded and the
    # original text is returned instead.
    MIN_SIMILARITY = 0.5

    def __init__(self):
        """Initialize the text humanizer with necessary models and data"""
        print("Loading AI Text Humanizer...")

        # Load sentence transformer for semantic similarity (optional)
        self.similarity_model = None
        if SENTENCE_TRANSFORMERS_AVAILABLE:
            try:
                print("📥 Loading sentence transformer...")
                self.similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
                print("✅ Sentence transformer loaded")
            except Exception as e:
                # Model loading is best-effort: similarity falls back to TF-IDF.
                print(f"⚠️ Warning: Could not load sentence transformer: {e}")
                self.similarity_model = None

        # Initialize paraphrasing pipeline (optional)
        self.paraphraser = None
        if TRANSFORMERS_AVAILABLE:
            try:
                print("📥 Loading paraphrasing model...")
                self.paraphraser = pipeline("text2text-generation",
                                            model="google/flan-t5-small",
                                            max_length=512)
                print("✅ Paraphrasing model loaded")
            except Exception as e:
                # Paraphrasing is best-effort: without it the pass is skipped.
                print(f"⚠️ Warning: Could not load paraphrasing model: {e}")
                self.paraphraser = None

        # Fallback TF-IDF vectorizer for similarity
        self.tfidf_vectorizer = None
        if SKLEARN_AVAILABLE:
            self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))

        # Formal to casual word mappings
        self.formal_to_casual = {
            "utilize": "use",
            "demonstrate": "show",
            "facilitate": "help",
            "implement": "do",
            "consequently": "so",
            "therefore": "so",
            "nevertheless": "but",
            "furthermore": "also",
            "moreover": "also",
            "subsequently": "then",
            "accordingly": "so",
            "regarding": "about",
            "concerning": "about",
            "pertaining": "about",
            "approximately": "about",
            "endeavor": "try",
            "commence": "start",
            "terminate": "end",
            "obtain": "get",
            "purchase": "buy",
            "examine": "look at",
            "analyze": "study",
            "construct": "build",
            "establish": "set up",
            "magnitude": "size",
            "comprehensive": "complete",
            "significant": "big",
            "substantial": "large",
            "optimal": "best",
            "sufficient": "enough",
            "prior to": "before",
            "in order to": "to",
            "due to the fact that": "because",
            "at this point in time": "now",
            "in the event that": "if",
            "it is important to note": "note that",
            "it should be emphasized": "remember",
            "it is worth mentioning": "by the way",
            "it is crucial to understand": "importantly",
        }

        # Contractions mapping
        self.contractions = {
            "do not": "don't",
            "does not": "doesn't",
            "did not": "didn't",
            "will not": "won't",
            "would not": "wouldn't",
            "should not": "shouldn't",
            "could not": "couldn't",
            "cannot": "can't",
            "is not": "isn't",
            "are not": "aren't",
            "was not": "wasn't",
            "were not": "weren't",
            "have not": "haven't",
            "has not": "hasn't",
            "had not": "hadn't",
            "I am": "I'm",
            "you are": "you're",
            "he is": "he's",
            "she is": "she's",
            "it is": "it's",
            "we are": "we're",
            "they are": "they're",
            "I have": "I've",
            "you have": "you've",
            "we have": "we've",
            "they have": "they've",
            "I will": "I'll",
            "you will": "you'll",
            "he will": "he'll",
            "she will": "she'll",
            "it will": "it'll",
            "we will": "we'll",
            "they will": "they'll",
        }

        # AI-like transition words
        self.ai_transition_words = [
            "Furthermore,", "Moreover,", "Additionally,", "Subsequently,",
            "Consequently,", "Therefore,", "Nevertheless,", "However,",
            "In conclusion,", "To summarize,", "In summary,", "Overall,",
            "It is important to note that", "It should be emphasized that",
            "It is worth mentioning that", "It is crucial to understand that",
            "It is essential to recognize that", "It must be acknowledged that"
        ]

        # Natural alternatives
        self.natural_transitions = [
            "Also,", "Plus,", "And,", "Then,", "So,", "But,", "Still,",
            "Anyway,", "By the way,", "Actually,", "Basically,",
            "Look,", "Listen,", "Here's the thing:", "The point is,",
            "What's more,", "On top of that,", "Another thing,",
            "Now,", "Well,", "You know,", "I mean,", "Honestly,",
        ]

        print("✅ AI Text Humanizer initialized successfully!")

    @staticmethod
    def _phrase_pattern(phrase: str, absorb_that: bool = False):
        """Compile a case-insensitive, whole-word pattern for *phrase*.

        Words of the phrase may be separated by any run of whitespace. The
        lookarounds stop the phrase from matching inside a longer word (for
        example "he is" inside "The issue"). With ``absorb_that`` the pattern
        also consumes one directly following "that".
        """
        body = r"\s+".join(re.escape(part) for part in phrase.split())
        if absorb_that:
            body += r"(?:\s+that)?"
        return re.compile(rf"(?<!\w){body}(?!\w)", re.IGNORECASE)

    @staticmethod
    def _match_case(matched: str, replacement: str) -> str:
        """Return *replacement* re-cased to follow the casing of *matched*.

        Fully upper-case matches give an upper-case replacement; a match that
        starts with a capital gives a replacement starting with a capital.
        Replacements that already start with a capital (e.g. "I'm") are kept.
        """
        if len(matched) > 1 and matched.isupper():
            return replacement.upper()
        if matched[:1].isupper() and replacement[:1].islower():
            return replacement[0].upper() + replacement[1:]
        return replacement

    @staticmethod
    def _decapitalize(sentence: str) -> str:
        """Lower-case only the first letter of *sentence*.

        The capital is kept when the first word is the pronoun "I" (or one of
        its contractions) or an all-caps token such as an acronym. Ordinary
        proper nouns cannot be recognised here and are still lower-cased.
        """
        first_word = sentence.split(" ", 1)[0].strip("\"'([.,!?;:")
        keeps_capital = (
            first_word == "I"
            or first_word.startswith("I'")
            or (len(first_word) > 1 and first_word.isupper())
        )
        if keeps_capital:
            return sentence
        return sentence[:1].lower() + sentence[1:]

    def add_contractions(self, text: str) -> str:
        """Add contractions to make text sound more natural.

        Every phrase in ``self.contractions`` is replaced wherever it occurs
        as whole words, regardless of case; the capitalisation of the matched
        text is carried over to the contraction.

        Args:
            text: Text to rewrite.

        Returns:
            The text with contractions applied.
        """
        for formal, casual in self.contractions.items():
            pattern = self._phrase_pattern(formal)
            text = pattern.sub(
                lambda match, casual=casual: self._match_case(match.group(0), casual),
                text,
            )
        return text

    def replace_formal_words(self, text: str, replacement_rate: float = 0.7) -> str:
        """Replace formal words with casual alternatives.

        Works directly on the string, so spacing, punctuation and existing
        contractions in the input are left untouched.

        Args:
            text: Text to rewrite.
            replacement_rate: Probability with which each individual
                occurrence of a formal word or phrase is replaced.

        Returns:
            The text with a random subset of formal expressions replaced.
        """
        # Longest phrases first so a multi-word phrase is handled before any
        # shorter entry could touch its words (sorted() is stable, so entries
        # of equal length keep their mapping order).
        entries = sorted(
            self.formal_to_casual.items(),
            key=lambda item: len(item[0].split()),
            reverse=True,
        )
        for formal, casual in entries:
            # A replacement ending in "that" would otherwise produce
            # "note that that ..." when the source already continues with "that".
            ends_with_that = casual.split()[-1:] == ["that"]
            pattern = self._phrase_pattern(formal, absorb_that=ends_with_that)

            def swap(match, casual=casual):
                if random.random() < replacement_rate:
                    return self._match_case(match.group(0), casual)
                return match.group(0)

            text = pattern.sub(swap, text)
        return text

    def vary_sentence_structure(self, text: str) -> str:
        """Vary sentence structure to sound more natural.

        Each sentence has a 30% chance of being prefixed with a conversational
        connector, and sentences longer than 20 words have a 40% chance of
        being split at a conjunction near their middle.

        Args:
            text: Text to rewrite.

        Returns:
            The rewritten sentences joined by single spaces.
        """
        connectors = ("Well,", "So,", "Now,", "Look,", "Actually,", "Basically,")
        conjunctions = ('and', 'but', 'or', 'so', 'then')
        varied_sentences = []

        for sentence in sent_tokenize(text):
            # Sometimes start sentences with connecting words
            if random.random() < 0.3 and not sentence.startswith(connectors):
                sentence = random.choice(connectors) + " " + self._decapitalize(sentence)

            # Occasionally break long sentences
            words = sentence.split()
            if len(words) > 20 and random.random() < 0.4:
                mid_point = len(words) // 2
                # Find a natural break point near the middle
                for i in range(max(0, mid_point - 3), min(mid_point + 3, len(words))):
                    if words[i].rstrip('.,!?;:') in conjunctions:
                        # Break *before* the conjunction so the first half is
                        # a finished sentence instead of ending on "and".
                        head = ' '.join(words[:i]).rstrip(',;:')
                        tail = ' '.join(words[i:])
                        if head and not head.endswith(('.', '!', '?')):
                            head += '.'
                        varied_sentences.append(head)
                        sentence = tail[:1].upper() + tail[1:]
                        break

            varied_sentences.append(sentence)

        return ' '.join(varied_sentences)

    def replace_ai_transitions(self, text: str) -> str:
        """Replace AI-like transition words with natural alternatives.

        For each entry of ``self.ai_transition_words`` found in the text
        (case-sensitive), only its first occurrence is replaced, by a randomly
        chosen entry of ``self.natural_transitions``.

        Args:
            text: Text to rewrite.

        Returns:
            The text with the matched transitions replaced.
        """
        for ai_phrase in self.ai_transition_words:
            if ai_phrase in text:
                replacement = random.choice(self.natural_transitions)
                text = text.replace(ai_phrase, replacement, 1)  # first occurrence only
        return text

    def add_natural_imperfections(self, text: str, imperfection_rate: float = 0.1) -> str:
        """Add subtle imperfections to make text more human-like.

        Args:
            text: Text to rewrite.
            imperfection_rate: Probability, per sentence, of applying each of
                the two imperfection kinds below.

        Returns:
            The rewritten sentences joined by single spaces.
        """
        modified_sentences = []

        for sentence in sent_tokenize(text):
            # Occasionally lower-case a leading conjunction (casual style)
            if random.random() < imperfection_rate:
                words = sentence.split()
                if len(words) > 1 and words[0].lower() in ('and', 'but', 'or', 'so'):
                    sentence = words[0].lower() + ' ' + ' '.join(words[1:])

            # Sometimes use informal punctuation
            if random.random() < imperfection_rate:
                if sentence.endswith('.'):
                    # Occasionally remove period for casual feel
                    sentence = sentence[:-1]
                elif not sentence.endswith(('.', '!', '?')):
                    if random.random() < 0.5:
                        sentence += '.'

            modified_sentences.append(sentence)

        return ' '.join(modified_sentences)

    def paraphrase_segments(self, text: str, paraphrase_rate: float = 0.3) -> str:
        """Paraphrase some segments using the transformer model.

        Only sentences longer than 8 words are candidates. A paraphrase is
        accepted when it is longer than 10 characters and shorter than twice
        the original sentence; otherwise, or on any model error, the original
        sentence is kept.

        Args:
            text: Text to rewrite.
            paraphrase_rate: Probability with which a candidate sentence is
                sent to the paraphrasing model.

        Returns:
            The sentences joined by single spaces, or ``text`` unchanged when
            no paraphrasing model is loaded.
        """
        if not self.paraphraser:
            return text

        paraphrased_sentences = []
        for sentence in sent_tokenize(text):
            if random.random() < paraphrase_rate and len(sentence.split()) > 8:
                try:
                    # Create paraphrase prompt
                    prompt = f"Rewrite this in a more natural, conversational way: {sentence}"
                    result = self.paraphraser(prompt, max_length=150, num_return_sequences=1)
                    paraphrased = result[0]['generated_text']

                    # Drop an echoed prompt and any wrapping quotes
                    paraphrased = paraphrased.replace(prompt, '').strip()
                    paraphrased = paraphrased.strip('"\'')

                    if 10 < len(paraphrased) < len(sentence) * 2:
                        paraphrased_sentences.append(paraphrased)
                    else:
                        paraphrased_sentences.append(sentence)
                except Exception as e:
                    # Best-effort pass: keep the original sentence on failure.
                    print(f"⚠️ Paraphrasing failed for sentence: {e}")
                    paraphrased_sentences.append(sentence)
            else:
                paraphrased_sentences.append(sentence)

        return ' '.join(paraphrased_sentences)

    def calculate_similarity_basic(self, text1: str, text2: str) -> float:
        """Basic similarity calculation using word overlap.

        Returns:
            The Jaccard index of the two lower-cased token sets. When either
            set is empty the result is 1.0 for identical texts, else 0.0.
        """
        words1 = set(word_tokenize(text1.lower()))
        words2 = set(word_tokenize(text2.lower()))

        if not words1 or not words2:
            return 1.0 if text1 == text2 else 0.0

        union = words1 | words2
        return len(words1 & words2) / len(union) if union else 1.0

    def calculate_similarity_tfidf(self, text1: str, text2: str) -> float:
        """Calculate similarity using TF-IDF vectors.

        Falls back to :meth:`calculate_similarity_basic` when scikit-learn is
        unavailable or the TF-IDF computation raises.

        Returns:
            Cosine similarity of the two TF-IDF vectors.
        """
        if not SKLEARN_AVAILABLE or not self.tfidf_vectorizer:
            return self.calculate_similarity_basic(text1, text2)

        try:
            tfidf_matrix = self.tfidf_vectorizer.fit_transform([text1, text2])
            similarity = sklearn_cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
            return float(similarity)
        except Exception as e:
            print(f"⚠️ TF-IDF similarity calculation failed: {e}")
            return self.calculate_similarity_basic(text1, text2)

    def calculate_similarity(self, text1: str, text2: str) -> float:
        """Calculate semantic similarity between original and humanized text.

        Uses the sentence-transformer embeddings when the model is loaded and
        falls back to :meth:`calculate_similarity_tfidf` when it is not, when
        encoding fails, or when an embedding has zero length (for which the
        cosine is undefined).

        Returns:
            Cosine similarity of the two texts.
        """
        if self.similarity_model:
            try:
                # One batched call instead of two separate encodes.
                vector1, vector2 = self.similarity_model.encode([text1, text2])
                norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
                if norm_product > 0:
                    return float(np.dot(vector1, vector2) / norm_product)
            except Exception as e:
                print(f"⚠️ Sentence transformer similarity failed: {e}")

        return self.calculate_similarity_tfidf(text1, text2)

    def humanize_text(self,
                      text: str,
                      style: str = "natural",
                      intensity: float = 0.7) -> Dict:
        """
        Main humanization function

        Args:
            text: Input text to humanize
            style: Style of humanization ('natural', 'casual', 'conversational').
                Imperfections are only added for 'casual' and 'conversational'.
            intensity: Intensity of humanization (0.0 to 1.0). Higher values
                enable more passes and raise their replacement rates.

        Returns:
            Dictionary with the keys "original_text", "humanized_text",
            "similarity_score", "changes_made", "style" and "intensity".
            When the similarity drops below ``MIN_SIMILARITY`` the original
            text is returned as the humanized text.
        """
        if not text.strip():
            return {
                "original_text": text,
                "humanized_text": text,
                "similarity_score": 1.0,
                "changes_made": [],
                "style": style,
                "intensity": intensity
            }

        changes_made = []
        original_text = text
        humanized_text = text

        def apply(step, label, *args):
            """Run one pass and record *label* when it changed the text."""
            nonlocal humanized_text
            updated = step(humanized_text, *args)
            if updated != humanized_text:
                changes_made.append(label)
            humanized_text = updated

        # Apply transformations based on intensity
        if intensity > 0.2:
            apply(self.replace_ai_transitions, "Replaced AI-like transition words")

        if intensity > 0.3:
            apply(self.replace_formal_words,
                  "Replaced formal words with casual alternatives",
                  intensity * 0.8)

        if intensity > 0.4:
            apply(self.add_contractions, "Added contractions")

        if intensity > 0.5:
            apply(self.vary_sentence_structure, "Varied sentence structure")

        if intensity > 0.6 and style in ["casual", "conversational"]:
            apply(self.add_natural_imperfections,
                  "Added natural imperfections",
                  intensity * 0.15)

        if intensity > 0.7 and self.paraphraser:
            apply(self.paraphrase_segments, "Paraphrased some segments", intensity * 0.3)

        # Calculate similarity; an unchanged text is identical by definition,
        # so no model inference is needed for it.
        if humanized_text == original_text:
            similarity_score = 1.0
        else:
            similarity_score = self.calculate_similarity(original_text, humanized_text)

        # Reject results that drifted too far from the original meaning
        if similarity_score < self.MIN_SIMILARITY:
            print(f"⚠️ Low similarity score ({similarity_score:.3f}), using original text")
            humanized_text = original_text
            similarity_score = 1.0
            changes_made = ["Similarity too low, reverted to original"]

        return {
            "original_text": original_text,
            "humanized_text": humanized_text,
            "similarity_score": similarity_score,
            "changes_made": changes_made,
            "style": style,
            "intensity": intensity
        }
# Demo run: humanize a deliberately stiff paragraph and report what happened.
if __name__ == "__main__":
    humanizer = AITextHumanizer()

    # Sample input packed with formal vocabulary and AI-style transitions
    test_text = """
Furthermore, it is important to note that artificial intelligence systems demonstrate
significant capabilities in natural language processing tasks. Subsequently, these
systems can analyze and generate text with remarkable accuracy. Nevertheless, it is
crucial to understand that human oversight remains essential for optimal performance.
Therefore, organizations should implement comprehensive strategies to utilize these
technologies effectively while maintaining quality standards.
"""
    sample = test_text.strip()

    print("Original Text:")
    print(sample)
    print(f"\n{'=' * 50}\n")

    result = humanizer.humanize_text(sample, style="conversational", intensity=0.8)
    changes = result["changes_made"]
    changes_summary = ', '.join(changes) if changes else 'None'

    print("Humanized Text:")
    print(result["humanized_text"])
    print(f"\nSimilarity Score: {result['similarity_score']:.3f}")
    print(f"Changes Made: {changes_summary}")

    # Report which optional dependencies were importable
    print("\nModel Status:")
    dependency_flags = (
        ("Sentence Transformers", SENTENCE_TRANSFORMERS_AVAILABLE),
        ("Transformers", TRANSFORMERS_AVAILABLE),
        ("Scikit-learn", SKLEARN_AVAILABLE),
    )
    for label, is_available in dependency_flags:
        status = '✅ Available' if is_available else '❌ Not available'
        print(f"- {label}: {status}")