Spaces:
Paused
Paused
| import gradio as gr | |
| import torch | |
| from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoTokenizer, AutoModelForSeq2SeqLM | |
| from bs4 import BeautifulSoup, NavigableString, Tag | |
| import re | |
| import time | |
| import random | |
| import nltk | |
| from nltk.tokenize import sent_tokenize | |
# Download required NLTK data. This is best effort: a failure here (for
# example when offline) must not stop the module from loading, but it is
# reported instead of being swallowed silently.
try:
    nltk.download('punkt', quiet=True)
except Exception as exc:
    print(f"Could not download NLTK 'punkt' data: {exc}")

# Try to import spaCy but make it optional.
# ``except Exception`` (not a bare ``except``) keeps Ctrl-C and SystemExit
# working while still tolerating any import-time failure inside spaCy.
try:
    import spacy
    SPACY_AVAILABLE = True
except Exception:
    print("spaCy not available, using NLTK for sentence processing")
    SPACY_AVAILABLE = False
class HumanLikeVariations:
    """Add human-like variations and intentional imperfections.

    All transformations draw from the module-level ``random`` generator, so
    calling a method twice on the same text will usually give different
    results.  Text is treated as sentences separated by ``'. '``.
    """

    # Long form -> contraction, used by ``apply_contractions``.
    _CONTRACTIONS = {
        "it is": "it's", "that is": "that's", "there is": "there's",
        "he is": "he's", "she is": "she's", "what is": "what's",
        "where is": "where's", "who is": "who's", "how is": "how's",
        "cannot": "can't", "will not": "won't", "do not": "don't",
        "does not": "doesn't", "did not": "didn't", "could not": "couldn't",
        "should not": "shouldn't", "would not": "wouldn't", "is not": "isn't",
        "are not": "aren't", "was not": "wasn't", "were not": "weren't",
        "have not": "haven't", "has not": "hasn't", "had not": "hadn't",
        "I am": "I'm", "you are": "you're", "we are": "we're",
        "they are": "they're", "I have": "I've", "you have": "you've",
        "we have": "we've", "they have": "they've", "I will": "I'll",
        "you will": "you'll", "he will": "he'll", "she will": "she'll",
        "we will": "we'll", "they will": "they'll", "I would": "I'd",
        "you would": "you'd", "he would": "he'd", "she would": "she'd",
        "we would": "we'd", "they would": "they'd", "could have": "could've",
        "should have": "should've", "would have": "would've", "might have": "might've",
        "must have": "must've", "there has": "there's", "here is": "here's",
        "let us": "let's", "that will": "that'll", "who will": "who'll"
    }

    # Only typo common words where the typo won't break meaning.
    _SAFE_TYPOS = {
        'the': 'teh',
        'and': 'adn',
        'that': 'taht',
        'with': 'wtih',
        'from': 'form',
        'because': 'becuase'
    }

    _QUESTION_WORDS = frozenset(['why', 'how', 'what', 'when', 'where'])
    _CONTRAST_WORDS = frozenset(['however', 'but', 'although', 'despite'])
    _BE_VERBS = ('is', 'are', 'was', 'were', 'been', 'be')
    _MODALS = ('can', 'could', 'will', 'would', 'should', 'might', 'may')

    def __init__(self):
        # Common human writing patterns - EXPANDED for Originality AI
        self.casual_transitions = [
            "So, ", "Well, ", "Now, ", "Actually, ", "Basically, ",
            "You know, ", "I mean, ", "Thing is, ", "Honestly, ",
            "Look, ", "Listen, ", "See, ", "Okay, ", "Right, ",
            "Anyway, ", "Besides, ", "Plus, ", "Also, ", "Oh, ",
            "Hey, ", "Alright, ", "Sure, ", "Fine, ", "Obviously, ",
            "Clearly, ", "Seriously, ", "Literally, ", "Frankly, ",
            "To be honest, ", "Truth is, ", "In fact, ", "Believe it or not, ",
            "Here's the thing, ", "Let me tell you, ", "Get this, ",
            "Funny thing is, ", "Interestingly, ", "Surprisingly, ",
            "Let's be real here, ", "Can we talk about ", "Quick question: ",
            "Real talk: ", "Hot take: ", "Unpopular opinion: ", "Fun fact: ",
            "Pro tip: ", "Side note: ", "Random thought: ", "Food for thought: ",
            "Just saying, ", "Not gonna lie, ", "For what it's worth, ",
            "If you ask me, ", "Between you and me, ", "Here's my take: ",
            "Let's face it, ", "No kidding, ", "Seriously though, ",
            "But wait, ", "Hold on, ", "Check this out: ", "Guess what? "
        ]
        self.filler_phrases = [
            "kind of", "sort of", "pretty much", "basically", "actually",
            "really", "just", "quite", "rather", "fairly", "totally",
            "definitely", "probably", "maybe", "perhaps", "somehow",
            "somewhat", "literally", "seriously", "honestly", "frankly",
            "simply", "merely", "purely", "truly", "genuinely",
            "absolutely", "completely", "entirely", "utterly", "practically",
            "virtually", "essentially", "fundamentally", "generally", "typically",
            "usually", "normally", "often", "sometimes", "occasionally",
            "apparently", "evidently", "obviously", "clearly", "seemingly",
            "arguably", "potentially", "possibly", "likely", "unlikely",
            "more or less", "give or take", "so to speak", "if you will",
            "per se", "as such", "in a way", "to some extent", "to a degree",
            "I kid you not", "no joke", "for real", "not gonna lie",
            "I'm telling you", "trust me", "believe me", "I swear",
            "hands down", "without a doubt", "100%", "straight up",
            "I think", "I feel like", "I guess", "I suppose", "seems like",
            "appears to be", "might be", "could be", "tends to", "tends to be",
            "in my experience", "from what I've seen", "as far as I know",
            "to the best of my knowledge", "if I'm not mistaken", "correct me if I'm wrong",
            "you know what", "here's the deal", "bottom line", "at any rate",
            "all in all", "when you think about it", "come to think of it",
            "now that I think about it", "if we're being honest", "to be fair"
        ]
        self.human_connectors = [
            ", which means", ", so", ", because", ", since", ", although",
            ". That's why", ". This means", ". So basically,", ". The thing is,",
            ", and honestly", ", but here's the thing", ", though", ", however",
            ". Plus,", ". Also,", ". Besides,", ". Moreover,", ". Furthermore,",
            ", which is why", ", and that's because", ", given that", ", considering",
            ". In other words,", ". Put simply,", ". To clarify,", ". That said,",
            ", you see", ", you know", ", right?", ", okay?", ", yeah?",
            ". Here's why:", ". Let me explain:", ". Think about it:",
            ", if you ask me", ", in my opinion", ", from my perspective",
            ". On the flip side,", ". On the other hand,", ". Conversely,",
            ", not to mention", ", let alone", ", much less", ", aside from",
            ". What's more,", ". Even better,", ". Even worse,", ". The catch is,",
            ", believe it or not", ", surprisingly enough", ", interestingly enough",
            ". Long story short,", ". Bottom line is,", ". Point being,",
            ", as you might expect", ", as it turns out", ", as luck would have it",
            ". And get this:", ". But wait, there's more:", ". Here's the kicker:",
            ", and here's why", ", and here's the thing", ", but here's what happened",
            ". Spoiler alert:", ". Plot twist:", ". Reality check:",
            ", at the end of the day", ", when all is said and done", ", all things considered",
            ". Make no mistake,", ". Don't get me wrong,", ". Let's not forget,",
            ", between you and me", ", off the record", ", just between us",
            ". And honestly?", ". But seriously,", ". And you know what?",
            ", which brings me to", ". This reminds me of", ", speaking of which",
            ". Funny enough,", ". Weird thing is,", ". Strange but true:",
            ", and I mean", ". I'm not kidding when I say", ", and trust me on this"
        ]
        # NEW: Common human typos and variations
        self.common_typos = {
            "the": ["teh", "th", "hte"],
            "and": ["adn", "nad", "an"],
            "that": ["taht", "htat", "tha"],
            "with": ["wiht", "wtih", "iwth"],
            "have": ["ahve", "hvae", "hav"],
            "from": ["form", "fro", "frmo"],
            "they": ["tehy", "thye", "htey"],
            "which": ["whihc", "wich", "whcih"],
            "their": ["thier", "theri", "tehir"],
            "would": ["woudl", "wuold", "woul"],
            "there": ["tehre", "theer", "ther"],
            "could": ["coudl", "cuold", "coud"],
            "people": ["poeple", "peopel", "pepole"],
            "through": ["thorugh", "throught", "trhough"],
            "because": ["becuase", "becasue", "beacuse"],
            "before": ["beofre", "befroe", "befor"],
            "different": ["differnt", "differnet", "diferent"],
            "between": ["bewteen", "betwen", "betewen"],
            "important": ["improtant", "importnat", "importan"],
            "information": ["infromation", "informaiton", "informaton"]
        }
        # NEW: Human-like sentence starters for variety
        self.varied_starters = [
            "When it comes to", "As for", "Regarding", "In terms of",
            "With respect to", "Concerning", "Speaking of", "About",
            "If we look at", "Looking at", "Considering", "Given",
            "Taking into account", "Bear in mind that", "Keep in mind",
            "It's worth noting that", "It should be noted that",
            "One thing to consider is", "An important point is",
            "What's interesting is", "What stands out is",
            "The key here is", "The main thing is", "The point is",
            "Here's what matters:", "Here's the deal:", "Here's something:",
            "Let's not forget", "We should remember", "Don't forget",
            "Think about it this way:", "Look at it like this:",
            "Consider this:", "Picture this:", "Imagine this:",
            "You might wonder", "You might ask", "You may think",
            "Some people say", "Many believe", "It's often said",
            "Research shows", "Studies indicate", "Evidence suggests",
            "Experience tells us", "History shows", "Time has shown"
        ]

    # ------------------------------------------------------------------
    # Small helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _lower_first(sentence):
        """Lower-case the first character unless the first word must stay capitalised.

        The pronoun "I" (and "I'm", "I've", ...) and all-caps words such as
        acronyms are left alone so a prefix does not turn them into "i" or "nASA".
        """
        first_word = sentence.split(None, 1)[0] if sentence.strip() else ''
        bare = first_word.strip('"\'([.,;:!?')
        if bare == 'I' or bare.startswith("I'") or (len(bare) > 1 and bare.isupper()):
            return sentence
        return sentence[:1].lower() + sentence[1:]

    @staticmethod
    def _match_case(original, replacement):
        """Capitalise ``replacement`` when the text it replaces was capitalised."""
        if original[:1].isupper() and replacement[:1].islower():
            return replacement[0].upper() + replacement[1:]
        return replacement

    def _pick_transition(self, sent):
        """Choose a casual transition that suits the sentence type."""
        # Whole-word matching: a substring test would see "but" in "attribute".
        tokens = set(re.findall(r"[a-z']+", sent.lower()))
        if tokens & self._QUESTION_WORDS:
            # Question-appropriate transitions
            return random.choice(["So, ", "Well, ", "Now, ", "Okay, ", "Right, "])
        if tokens & self._CONTRAST_WORDS:
            # Contrast-appropriate transitions
            return random.choice(["Still, ", "Yet, ", "Even so, ", "That said, ", "Nonetheless, "])
        # General transitions - use the more common ones
        return random.choice(self.casual_transitions[:20])

    @staticmethod
    def _pick_starter(sent):
        """Choose a sentence starter based on topical words in the sentence."""
        lowered = sent.lower()
        # Substring matching is deliberate here so "studies"/"importantly" count.
        if any(w in lowered for w in ('research', 'study', 'data', 'evidence')):
            return random.choice(["Research shows", "Studies indicate", "Evidence suggests", "Data reveals"])
        if any(w in lowered for w in ('important', 'crucial', 'vital', 'essential')):
            return random.choice(["It's worth noting that", "Keep in mind", "Bear in mind that", "The key here is"])
        return random.choice(["When it comes to", "As for", "Regarding", "In terms of"])

    def _insert_filler(self, sent):
        """Insert one filler word after a be-verb/modal or after a determiner."""
        words = sent.split()
        candidates = []
        for idx in range(1, len(words) - 1):
            current = words[idx].lower()
            if current in self._BE_VERBS:
                candidates.append(idx + 1)
            elif words[idx - 1].lower() in ('a', 'an', 'the', 'very', 'so'):
                candidates.append(idx)
            elif current in self._MODALS:
                candidates.append(idx + 1)
        if not candidates:
            return sent
        insert_pos = random.choice(candidates)
        previous = words[insert_pos - 1].lower()
        if previous in ('is', 'are', 'was', 'were'):
            filler = random.choice(['really', 'actually', 'definitely', 'certainly', 'quite'])
        elif previous in ('can', 'could', 'will', 'would'):
            filler = random.choice(['probably', 'definitely', 'certainly', 'likely', 'possibly'])
        else:
            filler = random.choice(['quite', 'rather', 'pretty', 'fairly', 'somewhat'])
        words.insert(insert_pos, filler)
        return ' '.join(words)

    def _insert_parenthetical(self, sent):
        """Insert a short parenthetical in the middle third of the sentence."""
        words = sent.split()
        count = len(words)
        break_points = [
            idx for idx, word in enumerate(words)
            if count // 3 < idx < 2 * count // 3
            and (word.endswith(',') or words[idx - 1].lower() in ('is', 'are', 'was', 'were'))
        ]
        if not break_points:
            return sent
        lowered = sent.lower()
        if any(w in lowered for w in ('surprising', 'interesting', 'amazing')):
            parenthetical = random.choice(["(and that's saying something)", "(believe it or not)", "(surprisingly enough)"])
        elif any(w in lowered for w in ('obvious', 'clear', 'evident')):
            parenthetical = random.choice(["(obviously)", "(clearly)", "(of course)"])
        else:
            parenthetical = random.choice(["(which makes sense)", "(for good reason)", "(as you'd expect)"])
        words.insert(random.choice(break_points), parenthetical)
        return ' '.join(words)

    @staticmethod
    def _append_question(sent):
        """Append a rhetorical question, closing the sentence first if needed."""
        lowered = sent.lower()
        if any(w in lowered for w in ('amazing', 'incredible', 'fantastic')):
            question = random.choice(["Pretty cool, right?", "Amazing, isn't it?", "Impressive, huh?"])
        elif any(w in lowered for w in ('important', 'crucial', 'essential')):
            question = random.choice(["Makes sense, right?", "See what I mean?", "Important to remember, yeah?"])
        else:
            question = random.choice(["Interesting, right?", "Makes you think, doesn't it?", "Sound familiar?"])
        if sent.rstrip()[-1:] not in ('.', '!', '?'):
            sent = sent.rstrip() + '.'
        return sent + " " + question

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def add_human_touch(self, text):
        """Add subtle human-like imperfections - MORE CONTEXT-AWARE.

        Args:
            text: Text whose sentences are separated by ``'. '``.

        Returns:
            The text with, per sentence and at random: a casual transition
            (25%), a filler word (20%), a sentence starter (15%),
            contractions (35%), a parenthetical (8%) and, on the last
            sentence only, a rhetorical question (5%).  Blank sentences are
            dropped.
        """
        sentences = text.split('. ')
        last_index = len(sentences) - 1
        modified_sentences = []
        for i, sent in enumerate(sentences):
            if not sent.strip():
                continue
            # Length thresholds use the untouched sentence so earlier
            # insertions do not make later ones more likely.
            base_len = len(sent.split())
            has_prefix = False

            # Occasionally start with casual transition (25% chance)
            if i > 0 and random.random() < 0.25 and base_len > 5:
                sent = self._pick_transition(sent) + self._lower_first(sent)
                has_prefix = True

            # Add filler words occasionally (20% chance).  Each step
            # re-tokenises the current sentence, so earlier edits survive.
            if random.random() < 0.2 and base_len > 8:
                sent = self._insert_filler(sent)

            # Add varied sentence starters (15% chance); never stacked on a
            # transition, which would read like "Regarding so, ...".
            if not has_prefix and i > 0 and random.random() < 0.15 and base_len > 10:
                sent = self._pick_starter(sent) + " " + self._lower_first(sent)

            # Occasionally use contractions (35% chance)
            if random.random() < 0.35:
                sent = self.apply_contractions(sent)

            # Add parenthetical thoughts (8% chance)
            if random.random() < 0.08 and base_len > 15:
                sent = self._insert_parenthetical(sent)

            # Occasionally add rhetorical questions (5% chance) - only at the end
            if random.random() < 0.05 and i == last_index:
                sent = self._append_question(sent)

            modified_sentences.append(sent)
        return '. '.join(modified_sentences)

    def apply_contractions(self, text):
        """Apply common contractions - EXPANDED.

        Each contraction is applied with 80% probability.  Matching is
        case-insensitive, a capitalised match stays capitalised
        ("It is" -> "It's"), and contractions other than "n't" forms are
        only applied when another word follows, so "what it is." is not
        turned into "what it's.".
        """
        for full, contr in self._CONTRACTIONS.items():
            if random.random() >= 0.8:  # 80% chance to apply each contraction
                continue
            pattern = r'\b' + re.escape(full) + r'\b'
            if not contr.endswith("n't"):
                pattern += r'(?=\s+\w)'
            text = re.sub(
                pattern,
                lambda m, contr=contr: self._match_case(m.group(0), contr),
                text,
                flags=re.IGNORECASE,
            )
        return text

    def add_minor_errors(self, text):
        """Add very minor, human-like errors - MORE REALISTIC BUT CONTROLLED.

        Possible edits, each gated by its own random draw: drop an Oxford
        comma (15%), swap the first eligible "that" for "which" (8%), typo
        one common word in a long sentence (2% per sentence), and confuse
        its/it's or your/you're once (2%).
        """
        # Occasionally miss Oxford comma (15% chance)
        if random.random() < 0.15:
            text = re.sub(r'(\w+), (\w+), and (\w+)', r'\1, \2 and \3', text)

        # Sometimes use 'which' instead of 'that' (8% chance); first match only
        if random.random() < 0.08:
            match = re.search(r'\b(\w+) that (\w+)', text)
            if match and match.group(1).lower() not in ('believe', 'think', 'know', 'say'):
                # Splice by position so exactly the matched span is replaced.
                text = (text[:match.start()]
                        + f"{match.group(1)} which {match.group(2)}"
                        + text[match.end():])

        # Very occasional typos (2% chance per sentence), only in longer sentences
        sentences = text.split('. ')
        for i, sent in enumerate(sentences):
            if random.random() < 0.02 and len(sent.split()) > 15:
                words = sent.split()
                # Second half of the sentence, avoiding the final word
                word_idx = random.randint(len(words) // 2, len(words) - 2)
                typo = self._SAFE_TYPOS.get(words[word_idx].lower())
                if typo is not None and random.random() < 0.5:
                    # Preserve original capitalization
                    if words[word_idx][0].isupper():
                        typo = typo[0].upper() + typo[1:]
                    words[word_idx] = typo
                    sentences[i] = ' '.join(words)
        text = '. '.join(sentences)

        # Mix up common homophones occasionally (2% chance) - ONLY SAFE ONES
        if random.random() < 0.02:
            safe_homophones = [
                ('its', "it's"),      # Very common mistake
                ('your', "you're"),   # Another common one
            ]
            for correct, confused in safe_homophones:
                if f" {correct} " in text and random.random() < 0.3:
                    pattern = rf'\b{correct}\s+(\w+ing|\w+ed)\b'
                    if re.search(pattern, text):
                        text = re.sub(pattern, f"{confused} \\1", text, count=1)
                        break
        return text

    def add_originality_specific_patterns(self, text):
        """Add patterns that Originality AI associates with human writing.

        Three independent random edits: a personal-opinion opener on an
        interior sentence (10%), a dash-delimited aside (8%) and an emphatic
        adverb, sometimes doubled (5%).
        """
        # 1. Add personal touches and opinions
        if random.random() < 0.1:
            personal_phrases = [
                "In my view, ", "From my perspective, ", "I believe ",
                "It seems to me that ", "I've found that ", "In my experience, ",
                "I tend to think ", "My take is that ", "I'd argue that ",
                "Personally, I think ", "If you ask me, ", "The way I see it, "
            ]
            sentences = text.split('. ')
            if len(sentences) > 3:
                idx = random.randint(1, len(sentences) - 2)
                # An empty segment (e.g. from ". . ") has no first letter to adjust.
                if sentences[idx].strip():
                    sentences[idx] = random.choice(personal_phrases) + self._lower_first(sentences[idx])
                    text = '. '.join(sentences)

        # 2. Add conversational asides
        if random.random() < 0.08:
            asides = [
                " - and this is important - ",
                " - bear with me here - ",
                " - stay with me - ",
                " - and I mean this - ",
                " - no exaggeration - ",
                " - true story - ",
                " - I'm serious - ",
                " - think about it - ",
                " - and here's why - "
            ]
            words = text.split()
            if len(words) > 20:
                pos = random.randint(10, len(words) - 10)
                # The join below supplies the spaces; keeping the aside's own
                # padding would leave double spaces on both sides.
                words.insert(pos, random.choice(asides).strip())
                text = ' '.join(words)

        # 3. Add emphatic repetition (human pattern)
        if random.random() < 0.05:
            emphatic_words = ['very', 'really', 'truly', 'absolutely', 'totally']
            sentences = text.split('. ')
            if sentences:
                sent_idx = random.randint(0, len(sentences) - 1)
                words = sentences[sent_idx].split()
                if len(words) > 5:
                    # Walk interior positions; each has a 30% chance of being used.
                    for i in range(1, len(words) - 1):
                        if random.random() < 0.3:
                            emphasis = random.choice(emphatic_words)
                            words.insert(i, emphasis)
                            # Sometimes repeat for extra emphasis
                            if random.random() < 0.3:
                                words.insert(i, emphasis + ',')
                            break
                    sentences[sent_idx] = ' '.join(words)
                text = '. '.join(sentences)
        return text
class SelectiveGrammarFixer:
    """Minimal grammar fixes to maintain human-like quality while fixing critical errors"""

    def __init__(self):
        # Not read by any method of this class as written.
        self.nlp = None
        self.human_variations = HumanLikeVariations()

    @staticmethod
    def _drop_cut_off_word(sent):
        """Remove a trailing 1-2 letter fragment that looks like a truncated word.

        A fragment counts as truncated only when it is alphabetic, contains
        no vowel (``y`` included, so "by" and "my" survive) and is not an
        all-caps abbreviation such as "TV".  Sentences of three words or
        fewer are returned unchanged.
        """
        words = sent.split()
        if len(words) <= 3:
            return sent
        last_word = words[-1].rstrip('.!?')
        looks_cut_off = (
            len(last_word) <= 2
            and last_word.isalpha()
            and not last_word.isupper()
            and not any(c in 'aeiouyAEIOUY' for c in last_word)
        )
        if not looks_cut_off:
            return sent
        sent = ' '.join(words[:-1])
        if sent and sent[-1] not in '.!?':
            sent += '.'
        return sent

    def fix_incomplete_sentences_only(self, text):
        """Fix only incomplete sentences without over-correcting.

        Splits on ``'. '``, terminates unterminated sentences, drops an
        apparently cut-off final fragment, capitalises sentence starts, and
        then passes the result through the three ``HumanLikeVariations``
        transformations (which are randomised).

        Args:
            text: Text to repair; falsy input is returned unchanged.

        Returns:
            The repaired and varied text.
        """
        if not text:
            return text
        sentences = text.split('. ')
        last_index = len(sentences) - 1
        fixed_sentences = []
        for i, sent in enumerate(sentences):
            sent = sent.strip()
            if not sent:
                continue
            if sent[-1] not in '.!?':
                # Middle sentences always get a period; the last one only
                # when it does not trail off with ':' or ','.
                if i != last_index or not sent.endswith((':', ',')):
                    sent += '.'
            sent = self._drop_cut_off_word(sent)
            # Capitalise at the start of the text and after sentence endings.
            if sent and sent[0].islower():
                if not fixed_sentences or fixed_sentences[-1].rstrip().endswith(('.', '!', '?')):
                    sent = sent[0].upper() + sent[1:]
            fixed_sentences.append(sent)
        result = ' '.join(fixed_sentences)
        # Add human-like variations
        result = self.human_variations.add_human_touch(result)
        result = self.human_variations.add_minor_errors(result)
        result = self.human_variations.add_originality_specific_patterns(result)
        return result

    def fix_basic_punctuation_errors(self, text):
        """Fix only the most egregious punctuation errors.

        Collapses whitespace runs, removes space before punctuation, adds a
        missing space after sentence punctuation, collapses repeated
        ``.``/``!``/``?`` and capitalises a standalone "i".
        """
        if not text:
            return text
        # Fix double spaces
        text = re.sub(r'\s{2,}', ' ', text)
        # Fix space before punctuation
        text = re.sub(r'\s+([.,!?;:])', r'\1', text)
        # Fix missing space after punctuation.  Skipped when the punctuation
        # follows a capital letter so dotted acronyms ("U.S.A") stay intact.
        text = re.sub(r'(?<![A-Z])([.,!?])([A-Z])', r'\1 \2', text)
        # Fix accidental double punctuation
        text = re.sub(r'([.!?])\1+', r'\1', text)
        # Fix "i" capitalization, but leave dotted abbreviations like "i.e." alone
        text = re.sub(r'(?<![.\w])i\b(?!\.\w)', 'I', text)
        return text

    def preserve_natural_variations(self, text):
        """Keep natural variation; only break up long text that has no periods.

        Text containing a period, or of 20 words or fewer, is returned
        unchanged.  Otherwise a period is added every 12-25 words (a fresh
        random gap after each break) and the following word is capitalised.
        A word already ending in punctuation is left as it is.
        """
        if text.count('.') != 0 or len(text.split()) <= 20:
            return text
        rebuilt = []
        words_since_break = 0
        gap = random.randint(12, 25)
        capitalize_next = False
        for word in text.split():
            if capitalize_next:
                if word[0].islower():
                    word = word[0].upper() + word[1:]
                capitalize_next = False
            words_since_break += 1
            if words_since_break >= gap:
                if word[-1] not in '.!?,;:':
                    word += '.'
                    capitalize_next = True
                # Draw the gap once per sentence rather than once per word.
                words_since_break = 0
                gap = random.randint(12, 25)
            rebuilt.append(word)
        return ' '.join(rebuilt)

    def smart_fix(self, text):
        """Apply minimal fixes to maintain human-like quality.

        Runs punctuation repair, sentence repair (with randomised
        variations) and period insertion, in that order.
        """
        text = self.fix_basic_punctuation_errors(text)
        text = self.fix_incomplete_sentences_only(text)
        text = self.preserve_natural_variations(text)
        return text
| class EnhancedDipperHumanizer: | |
def __init__(self):
    """Load the paraphrasing models and supporting components.

    Sets ``device``, ``grammar_fixer``, ``nlp``/``use_spacy``,
    ``tokenizer``/``model``/``is_dipper`` and, when available,
    ``bart_model``/``bart_tokenizer``/``use_bart``.

    Raises:
        RuntimeError: If neither the Dipper model nor the Flan-T5-XL
            fallback can be loaded.  The underlying error is chained as
            ``__cause__``.
    """
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {self.device}")
    # Clear GPU cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    # Initialize grammar fixer
    self.grammar_fixer = SelectiveGrammarFixer()
    # Try to load spaCy if available
    self.nlp = None
    self.use_spacy = False
    if SPACY_AVAILABLE:
        try:
            self.nlp = spacy.load("en_core_web_sm")
            self.use_spacy = True
            print("spaCy loaded successfully")
        except Exception as e:
            # Optional component: report why it failed and carry on.
            print("spaCy model not found, using NLTK for sentence splitting")
            print(f"spaCy load error: {str(e)}")
    try:
        # Load Dipper paraphraser WITHOUT 8-bit quantization for better performance
        print("Loading Dipper paraphraser model...")
        self.tokenizer = T5Tokenizer.from_pretrained('google/t5-v1_1-xxl')
        self.model = T5ForConditionalGeneration.from_pretrained(
            "kalpeshk2011/dipper-paraphraser-xxl",
            device_map="auto",  # This will distribute across 4xL40S automatically
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True
        )
        print("Dipper model loaded successfully!")
        self.is_dipper = True
    except Exception as e:
        print(f"Error loading Dipper model: {str(e)}")
        print("Falling back to Flan-T5-XL...")
        self.is_dipper = False
        # Fallback to Flan-T5-XL
        try:
            self.model = T5ForConditionalGeneration.from_pretrained(
                "google/flan-t5-xl",
                torch_dtype=torch.float16,
                low_cpu_mem_usage=True,
                device_map="auto"
            )
            self.tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
            print("Loaded Flan-T5-XL as fallback")
        except Exception as fallback_error:
            # Chain the cause so the real failure is visible in the traceback.
            # RuntimeError subclasses Exception, so existing handlers still match.
            raise RuntimeError(
                "Could not load any model. Please check your system resources."
            ) from fallback_error
    # Load BART as secondary model
    try:
        print("Loading BART model for additional variation...")
        self.bart_model = AutoModelForSeq2SeqLM.from_pretrained(
            "eugenesiow/bart-paraphrase",
            torch_dtype=torch.float16,
            device_map="auto"  # Distribute across GPUs
        )
        self.bart_tokenizer = AutoTokenizer.from_pretrained("eugenesiow/bart-paraphrase")
        self.use_bart = True
        print("BART model loaded successfully")
    except Exception as e:
        # Optional component: report why it failed and carry on without it.
        print("BART model not available")
        print(f"BART load error: {str(e)}")
        self.use_bart = False
def preserve_keywords(self, text, keywords):
    """Mark keywords to preserve them during paraphrasing.

    Each keyword is replaced, case-insensitively and only as a whole word,
    by a placeholder of the form ``__KW000__``.

    Args:
        text: The text to protect.
        keywords: Iterable of keyword strings; may be empty or ``None``.
            Blank entries are ignored.

    Returns:
        A ``(modified_text, keyword_map)`` tuple.  ``keyword_map`` maps
        each placeholder that was actually inserted to the keyword as it
        was spelled at its first occurrence in ``text``.
    """
    if not keywords:
        return text, {}
    keyword_map = {}
    modified_text = text
    # A blank keyword would match at every word boundary, so drop those first.
    usable = [kw for kw in keywords if kw and kw.strip()]
    # Longest first, so a short keyword cannot claim part of a longer one.
    for i, keyword in enumerate(sorted(usable, key=len, reverse=True)):
        placeholder = f"__KW{i:03d}__"  # e.g., __KW001__
        # Lookarounds instead of \b: \b never matches next to punctuation,
        # so keywords such as "C++" or ".NET" would otherwise be missed.
        pattern = re.compile(
            r'(?<!\w)' + re.escape(keyword) + r'(?!\w)', flags=re.IGNORECASE
        )
        first_match = pattern.search(modified_text)
        if first_match is None:
            continue
        # Store the original case version (first occurrence)
        keyword_map[placeholder] = first_match.group(0)
        modified_text = pattern.sub(placeholder, modified_text)
    return modified_text, keyword_map
def restore_keywords_robust(self, text, keyword_map):
    """Restore keywords with more flexible pattern matching.

    Placeholders of the form ``__KW000__`` are restored even when they come
    back altered: different case, inner spaces, missing or extra
    underscores, leading backticks, a doubled "K", or just ``KW000``.
    Every occurrence is restored in one pass, so restored keywords are
    never rescanned.  Keys that do not follow the ``__KW<digits>__`` format
    are restored by exact match only.

    Args:
        text: Text containing placeholders.
        keyword_map: Mapping of placeholder -> keyword.

    Returns:
        The text with keywords restored.  Doubled backticks are removed,
        doubled quotes become a single ``"``, and remaining runs of three
        or more underscores are filled with the map's keywords in order.
    """
    if not keyword_map:
        return text
    restored_text = text
    # Debug: print what we're working with
    print(f"Restoring keywords in text: {restored_text[:100]}...")
    print(f"Keyword map: {keyword_map}")

    # First pass: index standard placeholders by their digits; restore
    # non-standard keys by exact match.
    keywords_by_number = {}
    for placeholder, keyword in keyword_map.items():
        standard = re.fullmatch(r'__KW(\d+)__', placeholder)
        if standard:
            keywords_by_number[standard.group(1)] = keyword
        elif placeholder and placeholder in restored_text:
            print(f"Found exact placeholder {placeholder}, replacing with {keyword}")
            restored_text = restored_text.replace(placeholder, keyword)

    def _restore(match):
        keyword = keywords_by_number.get(match.group(1))
        if keyword is None:
            # Unknown number: leave the text as it is.
            return match.group(0)
        print(f"Found pattern '{match.group(0)}', replacing with {keyword}")
        return keyword

    # Second pass: exact and mangled placeholders.  The optional prefix and
    # suffix consume the surrounding underscores/backticks so none are left
    # behind next to the restored keyword.
    if keywords_by_number:
        restored_text = re.sub(
            r'(?:[_`]+\s*)?K?KW\s*(\d+)(?:\s*_+)?',
            _restore,
            restored_text,
            flags=re.IGNORECASE,
        )

    # Third pass: clean up any backticks or quotes that shouldn't be there
    restored_text = re.sub(r'``+', '', restored_text)
    restored_text = re.sub(r"''", '"', restored_text)
    restored_text = re.sub(r'""', '"', restored_text)

    # Fourth pass: fill remaining underscore runs with keywords in map
    # order.  A callback is used so that no match offsets go stale while
    # the text is being rewritten.
    if '___' in restored_text:
        pending = iter(keyword_map.values())
        restored_text = re.sub(
            r'_{3,}',
            lambda match: next(pending, match.group(0)),
            restored_text,
        )

    # Final check: report any KW patterns that weren't caught
    remaining_kw_patterns = re.findall(r'\bKW\d{3}\b', restored_text)
    if remaining_kw_patterns:
        print(f"Warning: Found remaining KW patterns: {remaining_kw_patterns}")
    # Log final result
    print(f"Final restored text: {restored_text[:100]}...")
    return restored_text
def should_skip_element(self, element, text):
    """Determine if an element should be skipped from paraphrasing.

    The decision is a chain of heuristics evaluated in order; the first one
    that matches wins.

    Args:
        element: The text node under consideration. Only ``element.parent``
            is read; the parent (and its ancestors) must expose ``name``,
            ``parent``, ``parents``, ``attrs`` and ``get()``.
        text: The text content of the node.

    Returns:
        True if the text must be left untouched, False if it may be
        paraphrased.
    """
    if not text or len(text.strip()) < 3:
        return True

    parent = element.parent

    if parent:
        # Non-content containers (JavaScript, CSS, noscript fallbacks)
        if parent.name in ('script', 'style', 'noscript'):
            return True

        # Headings (h1-h6) and the document title
        if parent.name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title'):
            return True

        # Content inside <strong> and <b> tags
        if parent.name in ('strong', 'b'):
            return True

        # Everything inside a table. This returns for *all* table content,
        # so no further table-specific handling is needed below (the
        # original had an extra, unreachable "formatting tag inside a
        # table" check after this point).
        if parent.name in ('td', 'th') or any(p.name == 'table' for p in parent.parents):
            return True

        # Table of contents. NOTE(review): this searches the parent's full
        # serialized markup, so any parent whose markup contains the word
        # "contents" is skipped as well - confirm that this is intended.
        parent_text = str(parent).lower()
        if any(toc in parent_text for toc in ('table of contents', 'toc-', 'contents')):
            return True

        # CTAs and buttons
        if parent.name in ('button', 'a'):
            return True

        # Parent carries an inline event handler
        if parent.attrs:
            event_handlers = ('onclick', 'onchange', 'onsubmit', 'onload',
                              'onmouseover', 'onmouseout')
            if any(handler in parent.attrs for handler in event_handlers):
                return True

        # Testimonial cards: look at the parent and up to two further
        # ancestors (three levels in total).
        current = parent
        for _ in range(3):
            if not current:
                break
            classes = current.get('class')
            if classes:
                if isinstance(classes, list):
                    if any('testimonial-card' in str(cls) for cls in classes):
                        return True
                elif isinstance(classes, str) and 'testimonial-card' in classes:
                    return True
            current = current.parent

    # Substrings that mark a class/ID as skip-worthy
    skip_indicators = (
        'cta-', 'button', 'btn', 'heading', 'title', 'caption',
        'toc-', 'contents', 'quiz', 'tip', 'note', 'alert',
        'warning', 'info', 'success', 'error', 'code', 'pre',
        'stats-grid', 'testimonial-card', 'highlight-box',
        'cta-box', 'quiz-container', 'news-box', 'contact-form',
        'faq-question', 'sidebar', 'widget', 'banner', 'news-section',
        'author-intro', 'testimonial', 'review', 'feedback',
        'floating-', 'stat-', 'progress-', 'option', 'results',
        'question-container', 'quiz-', 'faq-',
        'comparision-tables', 'process-flowcharts', 'infographics', 'cost-breakdown',
    )

    # Only the immediate parent and grandparent are inspected (not all ancestors)
    elements_to_check = [parent]
    if parent and parent.parent:
        elements_to_check.append(parent.parent)

    for elem in elements_to_check:
        if not elem:
            continue

        # Class attribute (only handled when it is a list of class names)
        elem_class = elem.get('class', [])
        if isinstance(elem_class, list):
            class_str = ' '.join(str(cls).lower() for cls in elem_class)
            if any(indicator in class_str for indicator in skip_indicators):
                return True

        # ID attribute
        elem_id = str(elem.get('id', '')).lower()
        if any(indicator in elem_id for indicator in skip_indicators):
            return True

    # Short phrases that look like UI labels
    word_count = len(text.split())
    if word_count <= 5:
        ui_patterns = (
            'click', 'download', 'learn more', 'read more', 'sign up',
            'get started', 'try now', 'buy now', 'next', 'previous',
            'back', 'continue', 'submit', 'cancel', 'get now', 'book your',
            'check out:', 'see also:', 'related:', 'question', 'of',
        )
        lowered = text.lower()
        if any(pattern in lowered for pattern in ui_patterns):
            return True

    # Short content in inline-styled boxes (border/background), unless the
    # box itself sits inside a paragraph
    if parent and parent.name in ('div', 'section', 'aside', 'blockquote'):
        style = parent.get('style', '')
        if ('border' in style or 'background' in style) and word_count <= 20:
            if not any(p.name == 'p' for p in parent.parents):
                return True

    return False
def is_likely_acronym_or_proper_noun(self, word):
    """Check if a word is likely an acronym or part of a proper noun.

    Args:
        word: A single token (no surrounding whitespace).

    Returns:
        True if the word is a known acronym (matched case-insensitively),
        is written entirely in upper case (2+ characters), or is one of
        the capitalised words that commonly continue a proper noun
        ("University", "Edition", ...). False otherwise.
    """
    # Known acronyms, stored upper-cased because the lookup below is done
    # on word.upper(). (The original stored 'PhD' in mixed case, which could
    # therefore never match.)
    acronyms = {'MBA', 'CEO', 'USA', 'UK', 'GMAT', 'GRE', 'SAT', 'ACT',
                'PHD', 'MD', 'IT', 'AI', 'ML'}
    # Acronyms that are also everyday English words. These only count when
    # written in capitals, otherwise the pronoun "it" (or "act"/"sat") would
    # be treated as an acronym and never be capitalised at sentence start.
    ambiguous = {'IT', 'ACT', 'SAT'}

    upper = word.upper()
    if upper in acronyms and (upper not in ambiguous or word.isupper()):
        return True

    # All caps (likely an acronym)
    if word.isupper() and len(word) > 1:
        return True

    # Words such as "Edition" or "Focus" that often follow a proper noun
    proper_noun_continuations = {
        'Edition', 'Version', 'Series', 'Focus', 'System', 'Method', 'School',
        'University', 'College', 'Institute', 'Academy', 'Center', 'Centre',
    }
    return word in proper_noun_continuations
def clean_model_output_enhanced(self, text):
    """Enhanced cleaning that preserves more natural structure.

    Removes the control prefix/tags and prompt echoes that the paraphrase
    models may emit, normalises stray backticks / doubled single quotes and
    trims leading punctuation.

    Args:
        text: Raw decoded model output (may be empty or None).

    Returns:
        The cleaned text, or "" for empty input.
    """
    if not text:
        return ""

    # Remove ONLY clear model artifacts
    text = re.sub(r'^lexical\s*=\s*\d+\s*,\s*order\s*=\s*\d+\s*', '', text, flags=re.IGNORECASE)
    text = re.sub(r'<sent>\s*', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\s*</sent>', '', text, flags=re.IGNORECASE)

    # Only remove clear prefixes
    lowered = text.lower()
    for prefix in ('paraphrase:', 'rewrite:'):
        if lowered.startswith(prefix):
            text = text[len(prefix):].strip()
            break

    # Clean up backticks that sometimes appear
    text = re.sub(r'``+', '', text)
    text = re.sub(r"''", '"', text)

    # Trim leading punctuation/whitespace, but never touch a leading keyword
    # placeholder. \W keeps digits, underscores and non-ASCII letters, so
    # sentences such as "2023 was ..." keep their first token.
    if not re.match(r'^(__KW\d+__|KW\d+)', text):
        before_trim = text
        text = re.sub(r'^\W+', '', text)
        # Safety net for the trim only: if it ate more than half of the
        # text, undo it. The comparison is made against the text *after*
        # artifact removal, so a legitimately removed prefix is never
        # restored.
        if len(text) < len(before_trim) * 0.5:
            text = before_trim

    return text.strip()
def paraphrase_with_dipper(self, text, lex_diversity=60, order_diversity=20, keywords=None):
    """Paraphrase text sentence by sentence with the primary model.

    Keywords are swapped for placeholders before generation and restored
    once all sentences have been re-joined. A sentence whose generation
    raises is kept verbatim.

    Args:
        text: Text to paraphrase. Falsy or shorter than 3 characters after
            stripping is returned unchanged.
        lex_diversity: Accepted for interface compatibility; the value is
            replaced per sentence (40 / 70 / 85) before it is used.
        order_diversity: Accepted for interface compatibility; the value is
            replaced per sentence (10 / 25 / 35) before it is used.
        keywords: Passed through to ``self.preserve_keywords``.

    Returns:
        The paraphrased text after keyword restoration and
        ``self.grammar_fixer.smart_fix``.
    """
    if not text or len(text.strip()) < 3:
        return text

    # Swap keywords for placeholders
    text_with_placeholders, keyword_map = self.preserve_keywords(text, keywords)

    if keyword_map:
        print(f"Debug: Created keyword map: {keyword_map}")
        print(f"Debug: Text with placeholders: {text_with_placeholders[:100]}...")

    rewritten = []
    for sentence in self.split_into_sentences_advanced(text_with_placeholders):
        # Too short to paraphrase - keep as is
        if len(sentence.strip()) < 3:
            rewritten.append(sentence)
            continue

        try:
            has_keywords = any(placeholder in sentence for placeholder in keyword_map.keys())

            # Diversity is chosen per sentence: moderate when placeholders
            # are present (to avoid mangling them), otherwise by length.
            if has_keywords:
                lex, order = 40, 10
            elif len(sentence.split()) < 10:
                lex, order = 70, 25
            else:
                lex, order = 85, 35

            lex_code = int(100 - lex)
            order_code = int(100 - order)

            # Build the model prompt
            input_text = (
                f"lexical = {lex_code}, order = {order_code} <sent> {sentence} </sent>"
                if self.is_dipper
                else f"paraphrase: {sentence}"
            )

            encoded = self.tokenizer(
                input_text,
                return_tensors="pt",
                max_length=512,
                truncation=True,
                padding=True,
            )

            # Place the inputs on the model's device (first device_map entry
            # when the model exposes a non-empty device_map)
            device_map = getattr(self.model, 'device_map', None)
            target = next(iter(device_map.values())) if device_map else self.device
            encoded = {name: tensor.to(target) for name, tensor in encoded.items()}

            # Length budget is derived from the word count of the input
            original_length = len(sentence.split())
            max_new_length = int(original_length * 1.3)

            # Lower temperature / more beams when placeholders are present
            generation_kwargs = dict(
                max_length=max_new_length + 20,
                min_length=max(5, int(original_length * 0.7)),
                do_sample=True,
                top_p=0.95 if has_keywords else 0.9,
                temperature=0.9 if has_keywords else 1.1,
                no_repeat_ngram_size=3,
                num_beams=3 if has_keywords else 2,
                early_stopping=True,
            )

            with torch.no_grad():
                outputs = self.model.generate(**encoded, **generation_kwargs)

            candidate = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            candidate = self.clean_model_output_enhanced(candidate)
            candidate = self.fix_incomplete_sentence_smart(candidate, sentence)

            # Cap the output at the word budget
            candidate_words = candidate.split()
            if len(candidate_words) > max_new_length:
                candidate = ' '.join(candidate_words[:max_new_length])

            rewritten.append(candidate)

        except Exception as e:
            # Best effort: keep the untouched sentence on any failure
            print(f"Error paraphrasing sentence: {str(e)}")
            rewritten.append(sentence)

    result = ' '.join(rewritten)

    # Debug before restoration
    if keyword_map:
        print(f"Debug: Result before restoration: {result[:100]}...")
        print("Debug: Checking for placeholders...")
        for placeholder in keyword_map.keys():
            if placeholder in result:
                print(f"Debug: Found placeholder {placeholder} in result")
            elif '___' in result:
                # Possibly a mangled placeholder
                print(f"Debug: Found underscores ___ instead of {placeholder}")

    # Restore keywords AFTER joining all sentences
    result = self.restore_keywords_robust(result, keyword_map)

    if keyword_map:
        print(f"Debug: Result after restoration: {result[:100]}...")

    # Minimal grammar fixes
    return self.grammar_fixer.smart_fix(result)
def fix_incomplete_sentence_smart(self, generated, original):
    """Smarter sentence completion that maintains natural flow.

    Trims an obviously truncated last token, appends closing punctuation
    modelled on the original sentence and capitalises the first letter.

    Args:
        generated: The model output for one sentence.
        original: The sentence that was paraphrased; returned unchanged
            when ``generated`` is empty (or becomes empty after trimming).

    Returns:
        The repaired sentence.
    """
    if not generated or not generated.strip():
        return original

    generated = generated.strip()
    words = generated.split()

    # A sentence that ends on one of these words is treated as complete;
    # it only gets a period if it has no terminal punctuation yet.
    if len(words) >= 3:
        ending_words = {
            'too', 'also', 'well', 'though', 'however',
            'furthermore', 'moreover', 'indeed', 'anyway',
            'regardless', 'nonetheless', 'therefore', 'thus',
        }
        if words[-1].lower().rstrip('.,!?;:') in ending_words:
            if generated[-1] not in '.!?':
                generated += '.'
            return generated

    # Drop a trailing fragment that is clearly a cut-off word: 1-2 lower-case
    # letters without a vowel. Digits ("5"), upper-case tokens ("TV") and
    # valid short words are kept.
    short_valid_words = {'is', 'of', 'to', 'in', 'on', 'at', 'by', 'or', 'if',
                         'so', 'up', 'no', 'we', 'he', 'me', 'be', 'do', 'go', 'my'}
    last_word = words[-1]
    if (len(last_word) <= 2 and
            last_word.isalpha() and
            last_word.islower() and
            last_word not in short_valid_words and
            not any(c in 'aeiou' for c in last_word)):
        words = words[:-1]
        generated = ' '.join(words)
        # Nothing left after trimming: the original sentence is the only
        # sensible result.
        if not generated:
            return original

    # Add ending punctuation based on context
    if generated[-1] not in '.!?:,;':
        orig_stripped = original.strip()
        if orig_stripped.endswith('?'):
            # Keep the question mark only if the output still opens like a question
            question_words = {'what', 'why', 'how', 'when', 'where', 'who', 'which',
                              'is', 'are', 'do', 'does', 'can', 'could', 'would', 'should'}
            generated += '?' if words[0].lower() in question_words else '.'
        elif orig_stripped.endswith('!'):
            # Keep the exclamation mark only if the output still sounds exclamatory
            exclaim_words = ('amazing', 'incredible', 'fantastic', 'terrible',
                             'awful', 'wonderful', 'excellent')
            lowered = generated.lower()
            generated += '!' if any(word in lowered for word in exclaim_words) else '.'
        elif orig_stripped.endswith(':'):
            generated += ':'
        else:
            generated += '.'

    # Capitalise the first letter unless the first word is an acronym /
    # proper-noun style token or a keyword placeholder
    if (generated[0].islower() and
            not self.is_likely_acronym_or_proper_noun(generated.split()[0]) and
            not generated.startswith('__KW')):
        generated = generated[0].upper() + generated[1:]

    return generated
def split_into_sentences_advanced(self, text):
    """Advanced sentence splitting using spaCy or NLTK.

    Uses ``self.nlp`` when ``self.use_spacy`` is set, otherwise NLTK's
    ``sent_tokenize``; if that raises, falls back to a regex split on
    whitespace following ``.``, ``!`` or ``?``.

    Args:
        text: Text to split.

    Returns:
        List of sentences with empty / whitespace-only entries removed.
    """
    if self.use_spacy and self.nlp:
        sentences = [sent.text.strip() for sent in self.nlp(text).sents]
    else:
        try:
            sentences = sent_tokenize(text)
        except Exception:
            # Tokenizer unavailable or failed: split with a regex instead.
            # Narrowed from a bare except so KeyboardInterrupt / SystemExit
            # are no longer swallowed.
            sentences = re.split(r'(?<=[.!?])\s+', text)

    # Drop empty and whitespace-only entries
    return [s for s in sentences if s and s.strip()]
def paraphrase_with_bart(self, text, keywords=None):
    """Run an additional paraphrasing pass with the secondary (BART) model.

    Args:
        text: Text to paraphrase. Returned unchanged when ``self.use_bart``
            is false, when the text is falsy or shorter than 3 characters
            after stripping, or when any step raises.
        keywords: Passed through to ``self.preserve_keywords``.

    Returns:
        The paraphrased text after keyword restoration and
        ``self.grammar_fixer.smart_fix``, or the input text on failure.
    """
    if not self.use_bart or not text or len(text.strip()) < 3:
        return text

    try:
        # Swap keywords for placeholders
        text_with_placeholders, keyword_map = self.preserve_keywords(text, keywords)

        rewritten = []
        # The model is fed one sentence at a time
        for sentence in self.split_into_sentences_advanced(text_with_placeholders):
            word_total = len(sentence.split())

            # Sentences under five words are passed through untouched
            if word_total < 5:
                rewritten.append(sentence)
                continue

            encoded = self.bart_tokenizer(
                sentence,
                return_tensors='pt',
                max_length=128,
                truncation=True,
            )

            # Place the inputs on the model's device (first device_map entry
            # when the model exposes a non-empty device_map)
            device_map = getattr(self.bart_model, 'device_map', None)
            target = next(iter(device_map.values())) if device_map else self.device
            encoded = {name: tensor.to(target) for name, tensor in encoded.items()}

            with torch.no_grad():
                outputs = self.bart_model.generate(
                    **encoded,
                    max_length=int(word_total * 1.4) + 10,
                    min_length=max(5, int(word_total * 0.6)),
                    num_beams=2,
                    temperature=1.1,
                    do_sample=True,
                    top_p=0.9,
                    early_stopping=True,
                )

            candidate = self.bart_tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Repair truncated / unpunctuated output
            rewritten.append(self.fix_incomplete_sentence_smart(candidate, sentence))

        # Restore keywords AFTER joining all sentences, then apply minimal
        # grammar fixes
        joined = self.restore_keywords_robust(' '.join(rewritten), keyword_map)
        return self.grammar_fixer.smart_fix(joined)

    except Exception as e:
        print(f"Error in BART paraphrasing: {str(e)}")
        return text
def apply_sentence_variation(self, text):
    """Apply natural sentence structure variations - MORE INTELLIGENT.

    Randomly (via the ``random`` module) merges related short sentences,
    splits long ones at a natural break, prepends transitions, inserts
    mid-sentence interruptions and, rarely, adds a short fragment.

    Args:
        text: Text to vary; split with ``self.split_into_sentences_advanced``.

    Returns:
        The varied text, sentences joined with single spaces.
    """
    sentences = self.split_into_sentences_advanced(text)
    varied_sentences = []

    for i, sentence in enumerate(sentences):
        # Skip empty sentences (including ones already merged into their
        # predecessor, which are blanked out below)
        if not sentence.strip():
            continue

        words = sentence.split()

        # Combine short sentences (50% chance) - BUT SMARTLY
        if (i < len(sentences) - 1 and
                len(words) < 15 and
                len(sentences[i + 1].split()) < 15 and
                random.random() < 0.5):
            next_sent = sentences[i + 1].strip()
            next_lower = next_sent.lower()

            # Related = shared longer words, or the next sentence contains a
            # referring word
            current_words = {w.lower() for w in words if len(w) > 3}
            next_words = {w.lower() for w in next_sent.split() if len(w) > 3}
            related = bool(next_sent) and (
                bool(current_words & next_words) or
                any(w in next_lower for w in ('this', 'that', 'these', 'those', 'it'))
            )

            if related:
                # Connector depends on how the next sentence opens
                opening = next_lower.split()[:3]
                if any(w in opening for w in ('however', 'but', 'yet', 'although')):
                    connector = random.choice([', but', '; however,', ', yet', ' - though'])
                elif any(w in opening for w in ('therefore', 'thus', 'so', 'hence')):
                    connector = random.choice([', so', '. Therefore,', ', which means', ' - thus'])
                elif any(w in opening for w in ('also', 'additionally', 'furthermore')):
                    connector = random.choice([', and', '. Also,', '. Plus,', ' - additionally,'])
                else:
                    connector = random.choice([', and', ', which', ' - '])

                combined = f"{sentence.rstrip('.')}{connector} {next_sent[0].lower()}{next_sent[1:]}"
                varied_sentences.append(combined)
                sentences[i + 1] = ""  # Mark as processed
            else:
                varied_sentences.append(sentence)
            continue

        # Split very long sentences at a natural break point
        if len(words) > 18:
            break_words = ('however', 'therefore', 'moreover', 'furthermore',
                           'additionally', 'consequently')
            conjunctions = (', and', ', but', ', so', ', yet', ', for', ', or', ', nor')

            # Prefer a break word past the first third of the sentence
            best_break = -1
            for idx, word in enumerate(words):
                if word.lower().rstrip(',') in break_words and idx > len(words) // 3:
                    best_break = idx
                    break

            # Otherwise the first listed conjunction that occurs; it is only
            # used if it falls in the middle third
            if best_break == -1:
                text_lower = sentence.lower()
                for conj in conjunctions:
                    if conj in text_lower:
                        conj_pos = text_lower.find(conj)
                        word_count = len(text_lower[:conj_pos].split())
                        if len(words) // 3 < word_count < 2 * len(words) // 3:
                            best_break = word_count
                        break

            if best_break > 0 and random.random() < 0.7:
                part1 = ' '.join(words[:best_break]).rstrip(',') + '.'
                part2 = ' '.join(words[best_break:])
                # Capitalize second part appropriately
                if part2 and part2[0].islower() and not part2.startswith(('however', 'therefore', 'moreover')):
                    part2 = part2[0].upper() + part2[1:]
                varied_sentences.append(part1)
                varied_sentences.append(part2)
            else:
                varied_sentences.append(sentence)
            continue

        # Transition (35% chance) - chosen now, applied after the optional
        # interruption so that rebuilding the sentence cannot discard it
        transition = None
        if i > 0 and random.random() < 0.35 and varied_sentences:
            prev_sent = varied_sentences[-1]
            sentence_lower = sentence.lower()
            if any(w in sentence_lower for w in ('however', 'but', 'although', 'despite')):
                transition = random.choice(['However, ', 'On the other hand, ', 'That said, ', 'Nevertheless, '])
            elif any(w in sentence_lower for w in ('example', 'instance', 'such as', 'like')):
                transition = random.choice(['For instance, ', 'For example, ', 'To illustrate, ', 'Consider this: '])
            elif any(w in prev_sent.lower() for w in ('first', 'second', 'finally', 'lastly')):
                transition = random.choice(['Next, ', 'Additionally, ', 'Furthermore, ', 'Also, '])
            else:
                transition = random.choice(['Furthermore, ', 'Additionally, ', 'Moreover, ', 'Also, '])
            # Only sentences that start with a capital letter get a transition
            if not sentence[0].isupper():
                transition = None

        # Mid-sentence interruption (10% chance) - ONLY WHERE NATURAL
        if random.random() < 0.1 and len(words) > 12:
            # Natural pause points: after a comma in the middle half, or
            # before a relative word past the first third
            pause_points = []
            for idx, word in enumerate(words):
                if word.endswith(',') and len(words) // 4 < idx < 3 * len(words) // 4:
                    pause_points.append(idx + 1)
                elif word.lower() in ('which', 'that', 'who', 'where') and idx > len(words) // 3:
                    pause_points.append(idx)

            if pause_points:
                pos = random.choice(pause_points)
                interruption = random.choice([
                    " - and this is important - ",
                    " - mind you - ",
                    " - interestingly - ",
                    " (worth noting) ",
                    " - by the way - ",
                ])
                # The join below supplies the surrounding spaces; strip the
                # padding to avoid double spaces.
                words.insert(pos, interruption.strip())
                sentence = ' '.join(words)

        if transition:
            sentence = transition + sentence[0].lower() + sentence[1:]

        varied_sentences.append(sentence)

    result = ' '.join(s for s in varied_sentences if s)

    # Occasional fragment for a human touch (5% chance), placed after the
    # first sentence that sets it up
    if random.random() < 0.05 and len(varied_sentences) > 3:
        pieces = result.split('. ')
        for idx, sent in enumerate(pieces[:-1]):
            sent_lower = sent.lower()
            if any(w in sent_lower for w in ('amazing', 'incredible', 'surprising', 'interesting')):
                fragments = ["Truly remarkable.", "Quite something.", "Really makes you think."]
            elif any(w in sent_lower for w in ('difficult', 'challenging', 'complex', 'complicated')):
                fragments = ["Not easy, for sure.", "Tough stuff.", "Challenging indeed."]
            else:
                continue
            # The pieces are re-joined with '. ', so the fragment's own
            # period is dropped to avoid producing "..".
            pieces.insert(idx + 1, random.choice(fragments).rstrip('.'))
            break
        result = '. '.join(pieces)

    return result
def fix_punctuation(self, text):
    """Comprehensive punctuation and formatting fixes.

    Normalises typographic quotes and dashes to ASCII, repairs spacing and
    doubled punctuation, capitalises sentence starts and the pronoun "I",
    and makes sure the text ends with terminal punctuation (unless it ends
    with a colon).

    Args:
        text: Text to clean (may be empty or None).

    Returns:
        The cleaned text, or "" for empty input.
    """
    if not text:
        return ""

    # First, clean any remaining model artifacts
    text = self.clean_model_output_enhanced(text)

    # Remove empty angle brackets
    text = text.replace('<>', '')

    # Normalise typographic quotes and dashes. Escapes are used so the
    # characters cannot be silently flattened to ASCII, which would turn
    # these replacements into no-ops.
    text = text.replace('\u00ab', '"').replace('\u00bb', '"')    # guillemets
    text = text.replace('\u201e', '"').replace('\u201c', '"').replace('\u201d', '"')
    text = text.replace('\u2018', "'").replace('\u2019', "'")    # curly single quotes
    text = text.replace('\u2013', '-').replace('\u2014', '-')    # en / em dash

    # Fix colon issues
    text = re.sub(r'\.:', ':', text)         # Remove period before colon
    text = re.sub(r':\s*\.', ':', text)      # Remove period after colon

    # Fix basic spacing
    text = re.sub(r'\s+', ' ', text)                         # Multiple spaces to single
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)             # Remove space before punctuation
    text = re.sub(r'([.,!?;:])\s*([.,!?;:])', r'\1', text)   # Remove double punctuation
    text = re.sub(r'([.!?])\s*\1+', r'\1', text)             # Remove repeated punctuation

    # Fix colons
    text = re.sub(r':\s*([.,!?])', ':', text)   # Remove punctuation after colon
    text = re.sub(r'([.,!?])\s*:', ':', text)   # Remove punctuation before colon
    text = re.sub(r':+', ':', text)             # Multiple colons to one

    # Trim padding just inside quotes and parentheses
    text = re.sub(r'"\s*([^"]*?)\s*"', r'"\1"', text)
    text = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", text)
    text = re.sub(r'\(\s*([^)]*?)\s*\)', r'(\1)', text)

    # Capitalise the first word of each sentence, except acronym /
    # proper-noun style tokens and keyword placeholders
    fixed_sentences = []
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if not sentence:
            continue
        words = sentence.split()
        if words:
            first_word = words[0]
            if (first_word[0].islower() and
                    not self.is_likely_acronym_or_proper_noun(first_word) and
                    not first_word.startswith('__KW') and
                    not first_word.startswith('_kw')):
                words[0] = first_word[0].upper() + first_word[1:]
                sentence = ' '.join(words)
        fixed_sentences.append(sentence)
    text = ' '.join(fixed_sentences)

    # Capitalise the pronoun "I", but leave "i.e." and <i> / </i> tags alone
    text = re.sub(r'(?<![</])\bi\b(?!\.e\b|>)', 'I', text)
    text = re.sub(r'\.{2,}', '.', text)          # Multiple periods to one
    text = re.sub(r',{2,}', ',', text)           # Multiple commas to one
    text = re.sub(r'\s*,\s*,\s*', ', ', text)    # Double commas with spaces

    # Remove weird artifacts
    text = re.sub(r'\b(CHAPTER\s+[IVX]+|SECTION\s+\d+)\b[^\w]*', '', text, flags=re.IGNORECASE)

    # Fix abbreviations
    text = re.sub(r'\betc\s*\.\s*\.', 'etc.', text)
    text = re.sub(r'\be\.g\s*\.\s*[,\s]', 'e.g., ', text)
    text = re.sub(r'\bi\.e\s*\.\s*[,\s]', 'i.e., ', text)

    # Fix numbers with periods (like "1. " at start of lists)
    text = re.sub(r'(\d+)\.\s+', r'\1. ', text)

    # Fix bold/strong tags punctuation
    text = self.fix_bold_punctuation(text)

    # Clean up any remaining issues
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)         # Final space cleanup
    text = re.sub(r'([.,!?;:])\s{2,}', r'\1 ', text)     # Multiple spaces after punctuation

    # Ensure ending punctuation; a trailing colon (likely a list header) is
    # left alone
    text = text.strip()
    if text and text[-1] not in '.!?' and not text.endswith(':'):
        text += '.'

    return text
def fix_bold_punctuation(self, text):
    """Tidy punctuation and spacing around <strong>/<b> elements.

    Text that opens with a bold "Label:" (optionally behind a bullet) is
    treated as a list item: only the padding inside that bold element is
    touched. Otherwise periods are stripped from bold content, bold text
    at a sentence start is capitalised, and spacing/periods next to the
    tags are normalised.

    Args:
        text: Text that may contain <strong> or <b> markup.

    Returns:
        The adjusted text.
    """
    list_item_re = re.compile(r'^\s*(?:[-•*▪▫◦‣⁃]\s*)?<(?:strong|b)>[^<]+:</(?:strong|b)>')

    def looks_like_list_item(candidate):
        # True when the text starts with <strong>Text:</strong> / <b>Text:</b>
        return list_item_re.search(candidate) is not None

    # List items keep the colon inside the bold element; only padding goes
    if looks_like_list_item(text):
        return re.sub(r'<(strong|b)>\s*([^:]+)\s*:\s*</\1>', r'<\1>\2:</\1>', text)

    # The callback needs the text as it was before substitution to look at
    # the two characters preceding each match.
    source = text

    def normalise_bold(found):
        tag = found.group(1)
        inner = found.group(2).strip()

        if not inner:
            return f'<{tag}></{tag}>'

        # Headers ending in a colon are preserved as they are
        if not inner.endswith(':'):
            # No periods at either edge of bold content
            inner = inner.strip('.')

            # Sentence start: very beginning of the text, or directly after
            # sentence-ending punctuation plus a space (or a blank line)
            offset = found.start()
            starts_sentence = offset == 0 or (
                offset > 2 and source[offset - 2:offset] in ('. ', '! ', '? ', '\n\n')
            )
            if starts_sentence and inner and inner[0].isalpha():
                inner = inner[0].upper() + inner[1:]

        return f'<{tag}>{inner}</{tag}>'

    text = re.sub(r'<(strong|b)>(.*?)</\1>', normalise_bold, text)

    # Spacing around the tags (not applied to list items)
    if not looks_like_list_item(text):
        spacing_rules = (
            (r'\.\s*<(strong|b)>', r'. <\1>'),            # Period before bold
            (r'</(strong|b)>\s*\.', r'</\1>.'),           # Period after bold
            (r'([.!?])\s*<(strong|b)>', r'\1 <\2>'),      # Space after sentence end
            (r'</(strong|b)>\s+([a-z])', r'</\1> \2'),    # Lowercase stays after bold mid-sentence
        )
        for pattern, replacement in spacing_rules:
            text = re.sub(pattern, replacement, text)

    # Duplicate periods around bold tags
    text = re.sub(r'\.\s*</(strong|b)>\s*\.', r'</\1>.', text)
    text = re.sub(r'\.\s*<(strong|b)>\s*\.', r'. <\1>', text)

    # Bold followed by a capital letter is taken as the start of a new
    # sentence, so a period is added after the closing tag
    text = re.sub(r'</(strong|b)>\s+([A-Z])', r'</\1>. \2', text)

    # Bold elements holding only a colon or a period collapse to that
    # character (not applied to list items)
    if not looks_like_list_item(text):
        text = re.sub(r'<(strong|b)>\s*:\s*</\1>', ':', text)
        text = re.sub(r'<(strong|b)>\s*\.\s*</\1>', '.', text)

    return text
def extract_text_from_html(self, html_content):
    """Extract text elements from HTML with skip logic.

    Args:
        html_content: HTML markup to parse.

    Returns:
        A ``(soup, text_elements)`` tuple. ``soup`` is the parsed document
        and ``text_elements`` is a list of ``{'text': str, 'element': node}``
        dicts, one per text node that passed ``self.should_skip_element``.
    """
    # Local import: the file's top-level bs4 import does not bring in these names.
    from bs4.element import CData, Comment, Declaration, Doctype, ProcessingInstruction

    # find_all(string=True) also yields these special string nodes. They are
    # markup, not visible content, so they must never be offered for
    # paraphrasing.
    non_content_nodes = (Comment, CData, Declaration, Doctype, ProcessingInstruction)

    soup = BeautifulSoup(html_content, 'html.parser')
    text_elements = []

    # string=True is used instead of the deprecated text=True
    for element in soup.find_all(string=True):
        if isinstance(element, non_content_nodes):
            continue

        # Skip script, style, and noscript content completely
        if element.parent.name in ('script', 'style', 'noscript'):
            continue

        text = element.strip()
        if text and not self.should_skip_element(element, text):
            text_elements.append({
                'text': text,
                'element': element,
            })

    return soup, text_elements
def validate_and_fix_html(self, html_text):
    """Fix common HTML syntax errors after processing.

    Args:
        html_text: Serialized HTML.

    Returns:
        The HTML with the DOCTYPE declaration, tag spacing and two known
        wording glitches repaired.
    """
    # Fix DOCTYPE
    html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)

    # Collapse whitespace between tags to ONE space instead of removing it.
    # Whitespace between adjacent inline elements is significant:
    # "<b>a</b> <i>b</i>" must not become "<b>a</b><i>b</i>".
    html_text = re.sub(r'>\s+<', '> <', html_text)
    # NOTE(review): the next two patterns also match "<" / ">" used as plain
    # text (e.g. "a > b") - confirm that such content cannot reach this point.
    html_text = re.sub(r'\s+>', '>', html_text)   # Remove spaces before closing >
    html_text = re.sub(r'<\s+', '<', html_text)   # Remove spaces after opening <

    # Fix common word errors that might occur during processing
    html_text = html_text.replace('down loaded', 'downloaded')
    html_text = html_text.replace('But your document', 'Your document')

    return html_text
def wrap_keywords_in_paragraphs(self, soup, keywords):
    """Wrap keyword occurrences in <strong> tags inside regular <p> content.

    The tree is modified in place. Only text nodes that are direct children
    of a <p> and at least 20 characters long (after stripping) are
    considered. Paragraphs are skipped when they sit inside certain
    container divs (matched by class-name fragment), inside
    button/a/blockquote/details/summary elements, or carry one of a few
    special classes themselves.

    Matching is case-insensitive and done in a single pass with one combined
    pattern (longest keyword first), so a short keyword can never match
    inside markup produced for a longer one, and keywords never match the
    inserted tag names. Replacement nodes are built directly rather than by
    re-parsing a string, so characters such as '<' or '&' in the paragraph
    text are never reinterpreted as markup.

    Args:
        soup: Parsed BeautifulSoup document to modify.
        keywords: Iterable of keyword strings; nothing happens when empty.

    Returns:
        None.
    """
    if not keywords:
        return

    # Local import: Comment is not part of the file-level bs4 import list.
    from bs4.element import Comment

    # Longest first so "GMAT Focus Edition" wins over "GMAT" at the same spot.
    ordered = sorted(dict.fromkeys(k for k in keywords if k), key=len, reverse=True)
    if not ordered:
        return
    # (?<!\w)...(?!\w) equals \b...\b for keywords that start and end with a
    # word character, and additionally works for keywords such as "C++".
    keyword_re = re.compile(
        r'(?<!\w)(?:' + '|'.join(re.escape(k) for k in ordered) + r')(?!\w)',
        flags=re.IGNORECASE,
    )

    skip_div_class_fragments = (
        'author-intro', 'cta-box', 'testimonial-card', 'news-box',
        'quiz-container', 'question-container', 'results', 'stats-grid',
        'toc-', 'comparison-tables',
    )
    skip_ancestor_tags = ('button', 'a', 'blockquote', 'details', 'summary')
    skip_paragraph_class_fragments = ('testimonial-card', 'quiz-', 'stat-')
    inline_parent_tags = ('strong', 'b', 'em', 'i', 'span', 'a')

    def class_string(tag):
        """Return the tag's class attribute flattened to one string."""
        classes = tag.get('class', [])
        if isinstance(classes, list):
            return ' '.join(str(cls) for cls in classes)
        return str(classes)

    def has_skipped_ancestor(p_tag):
        """Tell whether any ancestor excludes this paragraph from wrapping."""
        for parent in p_tag.parents:
            if parent.name == 'div' and parent.get('class'):
                div_classes = class_string(parent)
                if any(fragment in div_classes for fragment in skip_div_class_fragments):
                    return True
            if parent.name in skip_ancestor_tags:
                return True
        return False

    for p_tag in soup.find_all('p'):
        if has_skipped_ancestor(p_tag):
            continue

        p_class_str = class_string(p_tag)
        if any(fragment in p_class_str for fragment in skip_paragraph_class_fragments):
            continue

        # find_all returns a list snapshot, so editing the tree while looping is safe.
        for text_node in p_tag.find_all(string=True):
            # Never turn an HTML comment into visible text.
            if isinstance(text_node, Comment):
                continue
            # Leave text that is already inside inline formatting alone.
            if text_node.parent.name in inline_parent_tags:
                continue
            # Only direct children of the paragraph are rewritten.
            if text_node.parent is not p_tag:
                continue

            original_text = str(text_node)
            if len(original_text.strip()) < 20:
                continue

            # Split the text into plain runs and <strong>-wrapped matches.
            pieces = []
            cursor = 0
            for match in keyword_re.finditer(original_text):
                if match.start() > cursor:
                    pieces.append(NavigableString(original_text[cursor:match.start()]))
                strong_tag = soup.new_tag('strong')
                strong_tag.string = match.group(0)
                pieces.append(strong_tag)
                cursor = match.end()

            if not pieces:
                continue  # no keyword in this node
            if cursor < len(original_text):
                pieces.append(NavigableString(original_text[cursor:]))

            # Chain the new nodes after the old one, then drop the old one.
            anchor = text_node
            for piece in pieces:
                anchor.insert_after(piece)
                anchor = piece
            text_node.extract()
def add_natural_flow_variations(self, text):
    """Randomly inject conversational asides into the sentences of *text*.

    The text is split with ``self.split_into_sentences_advanced``; blank
    sentences are dropped. Each remaining sentence may independently receive:

    * a stream-of-consciousness aside in its middle (10% chance, only for
      sentences longer than 10 words),
    * a self-correction fragment in its second half (5% chance, only for
      sentences longer than 8 words),
    * a thinking-out-loud opener (8% chance, never on the first sentence).

    Args:
        text: Plain text to vary.

    Returns:
        The sentences joined with single spaces.
    """
    # Fragments carry no padding: they are inserted as list items and joined
    # with ' ', so padded fragments would produce double spaces.
    stream_elements = (
        "- wait, let me back up -",
        "- actually, scratch that -",
        "- or maybe I should say -",
        "- hmm, how do I put this -",
        "- okay, here's the thing -",
        "- you know what I mean? -",
    )
    corrections = (
        "- or rather,",
        "- well, actually,",
        "- I mean,",
        "- or should I say,",
        "- correction:",
    )
    # Openers are prefixed by concatenation, so they keep their trailing space.
    thinking_patterns = (
        "Come to think of it, ",
        "Actually, you know what? ",
        "Wait, here's a thought: ",
        "Oh, and another thing - ",
        "Speaking of which, ",
        "This reminds me, ",
        "Now that I mention it, ",
        "Funny you should ask, because ",
    )

    enhanced_sentences = []
    for i, sentence in enumerate(self.split_into_sentences_advanced(text)):
        if not sentence.strip():
            continue

        # Stream-of-consciousness aside (10% chance, long sentences only).
        if random.random() < 0.1 and len(sentence.split()) > 10:
            words = sentence.split()
            pos = random.randint(len(words) // 4, 3 * len(words) // 4)
            words.insert(pos, random.choice(stream_elements))
            sentence = ' '.join(words)

        # Self-correction fragment (5% chance, needs more than 8 words so the
        # randint range below is never empty).
        if random.random() < 0.05:
            words = sentence.split()
            if len(words) > 8:
                pos = random.randint(len(words) // 2, len(words) - 3)
                words.insert(pos, random.choice(corrections))
                sentence = ' '.join(words)

        # Thinking-out-loud opener (8% chance, never on the first sentence).
        if random.random() < 0.08 and i > 0 and len(sentence) > 1:
            opener = random.choice(thinking_patterns)
            first_word = sentence.split(None, 1)[0]
            # Keep the capital of the pronoun "I" and of acronyms ("AI", "GMAT").
            keep_capital = (
                first_word == "I"
                or first_word[:2] in ("I'", "I\u2019")
                or (len(first_word) > 1 and first_word[:2].isupper())
            )
            lead = sentence[0] if keep_capital else sentence[0].lower()
            sentence = opener + lead + sentence[1:]

        enhanced_sentences.append(sentence)

    return ' '.join(enhanced_sentences)
def process_html(self, html_content, primary_keywords="", secondary_keywords="", progress_callback=None):
    """Paraphrase the text of an HTML document while keeping its markup.

    Script and style tags are swapped for placeholders before processing and
    put back verbatim at the very end, after all regex clean-ups, so their
    contents are never rewritten.

    Args:
        html_content: HTML source to process.
        primary_keywords: Comma-separated keywords to preserve.
        secondary_keywords: Comma-separated keywords to preserve.
        progress_callback: Optional callable ``(current, total)`` invoked
            after each text element has been handled.

    Returns:
        The processed HTML string. For empty/blank input the message
        "Please provide HTML content." is returned. If processing raises,
        the ORIGINAL input is returned, preceded by an HTML comment that
        contains the error message and traceback.
    """
    if not html_content or not html_content.strip():
        return "Please provide HTML content."

    # Keep the untouched input: the error fallback must not hand back the
    # placeholder-substituted document (that would drop scripts and styles).
    original_html = html_content

    script_placeholder = "###SCRIPT_PLACEHOLDER_{}###"
    style_placeholder = "###STYLE_PLACEHOLDER_{}###"
    preserved_scripts = []
    preserved_styles = []

    # Temporarily replace script and style tags with placeholders.
    soup_temp = BeautifulSoup(html_content, 'html.parser')
    for idx, script in enumerate(soup_temp.find_all('script')):
        preserved_scripts.append(str(script))
        script.replace_with(script_placeholder.format(idx))
    for idx, style in enumerate(soup_temp.find_all('style')):
        preserved_styles.append(str(style))
        style.replace_with(style_placeholder.format(idx))
    html_content = str(soup_temp)

    # Parse, clean and de-duplicate keywords (case-insensitive, order kept;
    # empty and single-character entries are dropped).
    all_keywords = []
    seen = set()
    for raw_keywords in (primary_keywords, secondary_keywords):
        if not raw_keywords:
            continue
        for part in raw_keywords.split(','):
            cleaned = part.strip()
            if len(cleaned) > 1 and cleaned.lower() not in seen:
                seen.add(cleaned.lower())
                all_keywords.append(cleaned)

    try:
        soup, text_elements = self.extract_text_from_html(html_content)
        total_elements = len(text_elements)
        print(f"Found {total_elements} text elements to process (after filtering)")
        if all_keywords:
            print(f"Preserving keywords: {all_keywords}")

        processed_count = 0
        for i, element_info in enumerate(text_elements):
            original_text = element_info['text']

            # Skip script/style placeholders.
            if "###SCRIPT_PLACEHOLDER_" in original_text or "###STYLE_PLACEHOLDER_" in original_text:
                continue
            # Skip very short texts.
            if len(original_text.split()) < 3:
                continue

            lowered_text = original_text.lower()
            if any(keyword.lower() in lowered_text for keyword in all_keywords):
                print(f"Debug: Processing text with keywords: {original_text[:50]}...")

            # First pass with Dipper.
            paraphrased_text = self.paraphrase_with_dipper(
                original_text,
                keywords=all_keywords
            )

            # Retry keyword restoration if placeholder residue is visible.
            if '__KW' in paraphrased_text or '___' in paraphrased_text:
                print(f"Warning: Placeholder or underscores found in paraphrased text: {paraphrased_text[:100]}...")
                temp_map = {f'__KW{j:03d}__': keyword for j, keyword in enumerate(all_keywords)}
                paraphrased_text = self.restore_keywords_robust(paraphrased_text, temp_map)

            # Second pass with BART for longer texts (50% chance).
            if self.use_bart and len(paraphrased_text.split()) > 8:
                if random.random() < 0.5:
                    paraphrased_text = self.paraphrase_with_bart(
                        paraphrased_text,
                        keywords=all_keywords
                    )

            paraphrased_text = self.apply_sentence_variation(paraphrased_text)
            paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
            paraphrased_text = self.fix_punctuation(paraphrased_text)

            # Fall back to the original text if placeholders are still unresolved.
            if '___' in paraphrased_text or '__KW' in paraphrased_text:
                print("Error: Unresolved placeholders in final text")
                paraphrased_text = original_text

            # Final quality check before touching the tree.
            if paraphrased_text and len(paraphrased_text.split()) >= 3:
                element_info['element'].replace_with(NavigableString(paraphrased_text))
                processed_count += 1

            if progress_callback:
                progress_callback(i + 1, total_elements)
            if i % 10 == 0 or i == total_elements - 1:
                percent_done = (i + 1) / total_elements * 100
                print(f"Progress: {percent_done:.1f}%")

        # Wrap keywords with <strong> tags in paragraphs.
        self.wrap_keywords_in_paragraphs(soup, all_keywords)

        # Post-process the entire HTML to fix bold/strong formatting.
        result = self.post_process_html(str(soup))

        # Final safety net for leftover keyword placeholders / underscore runs.
        if '__KW' in result or re.search(r'_{3,}', result):
            print("Warning: Found placeholders or multiple underscores in final HTML output")
            for idx, keyword in enumerate(all_keywords):
                result = result.replace(f'__KW{idx:03d}__', keyword)
                # A callable replacement inserts the keyword literally;
                # a plain string would have its backslashes interpreted.
                result = re.sub(r'_{3,}', lambda _match, kw=keyword: kw, result, count=1)

        # Run the syntax clean-ups BEFORE restoring scripts/styles so the
        # whitespace regexes cannot alter the preserved JS/CSS.
        result = self.validate_and_fix_html(result)

        for idx, script_content in enumerate(preserved_scripts):
            result = result.replace(script_placeholder.format(idx), script_content)
        for idx, style_content in enumerate(preserved_styles):
            result = result.replace(style_placeholder.format(idx), style_content)

        # Count skipped elements.
        all_text_elements = soup.find_all(string=True)
        skipped = len([e for e in all_text_elements if e.strip() and e.parent.name not in ['script', 'style', 'noscript']]) - total_elements
        print(f"Successfully processed {processed_count} text elements")
        print(f"Skipped {skipped} elements (headings, CTAs, tables, testimonials, strong/bold tags, etc.)")
        print(f"Preserved {len(preserved_scripts)} script tags and {len(preserved_styles)} style tags")
        return result
    except Exception as e:
        import traceback
        error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        # "--" is not allowed inside an HTML comment and "-->" would close it early.
        safe_msg = error_msg.replace('--', '- -')
        # Return the original HTML with the error prepended as an HTML comment.
        return f"<!-- {safe_msg} -->\n{original_html}"
def post_process_html(self, html_text):
    """Clean up punctuation and bold/strong formatting in serialized HTML.

    The steps run in a fixed order: stray/double angle brackets, periods next
    to bold tags, per-line fixes (lines containing a ``<strong>Label:</strong>``
    style item are left untouched), capitalisation of bold text that directly
    follows ``.``, ``!``, ``?`` or ``>``, spacing and duplicate punctuation
    outside such label items, and finally ``self.grammar_fixer.smart_fix``.

    Args:
        html_text: HTML string to clean.

    Returns:
        The cleaned HTML string.
    """
    # Fix empty angle brackets that might appear.
    html_text = re.sub(r'<>\s*([^<>]+?)\s*(?=\.|\s|<)', r'\1', html_text)  # Remove <> around text
    html_text = re.sub(r'<>', '', html_text)  # Remove any remaining empty <>

    # Fix double angle brackets around bold tags.
    html_text = re.sub(r'<<b>>', '<b>', html_text)
    html_text = re.sub(r'<</b>>', '</b>', html_text)
    html_text = re.sub(r'<<strong>>', '<strong>', html_text)
    html_text = re.sub(r'<</strong>>', '</strong>', html_text)

    # Fix periods around bold/strong tags. The replacements MUST be raw
    # strings: in a normal string literal '\1' is the control character
    # chr(1), not a backreference, and would replace the tag name.
    html_text = re.sub(r'\.\s*<(b|strong)>', r'. <\1>', html_text)  # Period before bold
    html_text = re.sub(r'</(b|strong)>\s*\.', r'</\1>.', html_text)  # Period after bold
    html_text = re.sub(r'\.<<(b|strong)>>', r'. <\1>', html_text)  # Double bracket cases
    html_text = re.sub(r'</(b|strong)>>\.', r'</\1>.', html_text)

    # Fix periods after colons.
    html_text = re.sub(r':\s*\.', ':', html_text)
    html_text = re.sub(r'\.:', ':', html_text)

    list_pattern = re.compile(r'(?:^|\s)(?:[-•*▪▫◦‣⁃]\s*)?<(?:strong|b)>[^<]+:</(?:strong|b)>')

    def process_line(line):
        """Apply bold/period fixes to one line unless it holds a label item."""
        if list_pattern.search(line):
            # This is a list item, preserve the colon format.
            return line
        # Remove periods immediately inside bold tags.
        line = re.sub(r'<(strong|b)>\s*\.\s*([^<]+)\s*\.\s*</\1>', r'<\1>\2</\1>', line)
        # Fix sentence endings with bold.
        line = re.sub(r'</(strong|b)>\s*([.!?])', r'</\1>\2', line)
        return line

    # Process line by line to preserve list formatting.
    html_text = '\n'.join(process_line(line) for line in html_text.split('\n'))

    def fix_bold_sentence_start(match):
        """Capitalise bold text that directly follows a sentence end or a tag."""
        pre_context, tag, first_letter = match.group(1), match.group(2), match.group(3)
        if pre_context == '' or pre_context.endswith(('.', '!', '?', '>')):
            first_letter = first_letter.upper()
        return f'{pre_context}<{tag}>{first_letter}'

    # Capture only the tag NAME so the rebuilt text is "<strong>X", never
    # "<<strong>>X" that a later global clean-up has to repair.
    html_text = re.sub(r'(^|.*?)<(strong|b)>([a-zA-Z])', fix_bold_sentence_start, html_text)

    # Clean up spacing around bold tags, leaving "<strong>Label:</strong>"
    # items (the odd-indexed pieces of the split) untouched.
    segments = re.split(r'(<(?:strong|b)>[^<]*:</(?:strong|b)>)', html_text)
    cleaned_segments = []
    for i, segment in enumerate(segments):
        if i % 2 == 0:
            segment = re.sub(r'\s+<(strong|b)>', r' <\1>', segment)
            segment = re.sub(r'</(strong|b)>\s+', r'</\1> ', segment)
            # Collapse adjacent punctuation marks to the first one.
            segment = re.sub(r'([.,!?;:])\s*([.,!?;:])', r'\1', segment)
            # Fix periods inside/around bold.
            segment = re.sub(r'\.<(strong|b)>\.', r'. <\1>', segment)
            segment = re.sub(r'\.</(strong|b)>\.', r'</\1>.', segment)
        cleaned_segments.append(segment)
    html_text = ''.join(cleaned_segments)

    # Final cleanup.
    html_text = re.sub(r'\.{2,}', '.', html_text)  # Multiple periods
    html_text = re.sub(r',{2,}', ',', html_text)  # Multiple commas
    html_text = re.sub(r':{2,}', ':', html_text)  # Multiple colons
    html_text = re.sub(r'\s+([.,!?;:])', r'\1', html_text)  # Space before punctuation

    # Remove empty bold tags.
    html_text = re.sub(r'<(strong|b)>\s*</\1>', '', html_text)

    # Numbers such as "5,000+" at the end of a line should not get a period.
    html_text = re.sub(r'(\d+[,\d]*\+?)\s*\.\s*\n', r'\1\n', html_text)

    # Clean up any remaining double brackets.
    html_text = re.sub(r'<<', '<', html_text)
    html_text = re.sub(r'>>', '>', html_text)

    # Apply final minimal grammar fixes.
    html_text = self.grammar_fixer.smart_fix(html_text)
    return html_text
# Initialize the humanizer.
# Created once at module level (i.e. when this file is imported/run); the
# humanize_html() Gradio handler below calls process_html() on this instance.
humanizer = EnhancedDipperHumanizer()
def humanize_html(html_input, primary_keywords="", secondary_keywords="", progress=gr.Progress()):
    """Gradio handler: run the module-level humanizer and report progress.

    Args:
        html_input: HTML text from the input textbox.
        primary_keywords: Comma-separated primary keywords.
        secondary_keywords: Comma-separated secondary keywords.
        progress: Gradio progress tracker used for status updates.

    Returns:
        The string produced by ``humanizer.process_html``, or a prompt
        message when no HTML was supplied.
    """
    if not html_input:
        return "Please provide HTML content to humanize."

    def report(done, total):
        # Guard against a zero total so the fraction is always defined.
        if total > 0:
            progress(done / total, desc=f"Processing: {done}/{total} elements")

    progress(0, desc="Starting processing...")
    started_at = time.time()

    output = humanizer.process_html(
        html_input,
        primary_keywords,
        secondary_keywords,
        progress_callback=report,
    )

    elapsed = time.time() - started_at
    print(f"Processing completed in {elapsed:.2f} seconds")
    progress(1.0, desc="Complete!")
    return output
# Create Gradio interface with queue.
# The three input textboxes are passed positionally to humanize_html() as
# html_input, primary_keywords and secondary_keywords, in that order.
iface = gr.Interface(
    fn=humanize_html,
    inputs=[
        # HTML document to be humanized.
        gr.Textbox(
            lines=10,
            placeholder="Paste your HTML content here...",
            label="HTML Input"
        ),
        # Comma-separated primary keywords.
        gr.Textbox(
            placeholder="Enter primary keywords separated by commas (e.g., GMAT Focus Edition, MBA, Data Insights)",
            label="Primary Keywords (preserved exactly)"
        ),
        # Comma-separated secondary keywords.
        gr.Textbox(
            placeholder="Enter secondary keywords separated by commas (e.g., test preparation, business school)",
            label="Secondary Keywords (preserved exactly)"
        )
    ],
    # The handler returns a single string, shown in one output textbox.
    outputs=gr.Textbox(
        lines=10,
        label="Humanized HTML Output"
    ),
    title="Enhanced Dipper AI Humanizer - Optimized for Originality AI",
    description="""
Ultra-aggressive humanizer optimized to achieve 100% human scores on both Undetectable AI and Originality AI.
Key Features:
- Maximum diversity settings (90% lexical, 40% order) for natural variation
- Enhanced human patterns: personal opinions, self-corrections, thinking-out-loud
- Natural typos, contractions, and conversational flow
- Stream-of-consciousness elements and rhetorical questions
- Originality AI-specific optimizations: varied sentence starters, emphatic repetitions
- Fixed placeholder system that preserves keywords
- Keywords inside <p> tags are automatically wrapped with <strong> tags
- Skips content in <strong>, <b>, and heading tags (including inside tables)
- Designed to pass the strictest AI detection systems
The tool creates genuinely human-like writing patterns that fool even the most sophisticated detectors!
⚠️ Note: Processing may take 5-10 minutes for large HTML documents.
""",
    # One example row: [html_input, primary_keywords, secondary_keywords].
    examples=[
        ["""<article>
<h1>The Benefits of Regular Exercise</h1>
<div class="author-intro">By John Doe, Fitness Expert | 10 years experience</div>
<p>Regular exercise is essential for maintaining good health. It helps improve cardiovascular fitness, strengthens muscles, and enhances mental well-being. Studies have shown that people who exercise regularly have lower risks of chronic diseases.</p>
<p>Additionally, exercise can boost mood and energy levels. It releases endorphins, which are natural mood elevators. Even moderate activities like walking can make a significant difference in overall health.</p>
</article>""", "cardiovascular fitness, mental well-being, chronic diseases", "exercise, health, endorphins"]
    ],
    theme="default"
)

if __name__ == "__main__":
    # Enable queue for better handling of long-running processes.
    # NOTE(review): max_size presumably caps the number of waiting requests — confirm against the Gradio docs.
    iface.queue(max_size=10)
    # NOTE(review): share=True presumably requests a public share link — confirm this is wanted for the deployment.
    iface.launch(share=True)