Update app.py

app.py
CHANGED

@@ -352,6 +352,26 @@ class HumanLikeVariations:
             return variation(sentence)
         except:
             return sentence
+
+    def split_into_sentences_advanced(self, text):
+        """Advanced sentence splitting using spaCy or NLTK"""
+        if SPACY_AVAILABLE:
+            try:
+                nlp = spacy.load("en_core_web_sm")
+                doc = nlp(text)
+                sentences = [sent.text.strip() for sent in doc.sents]
+            except:
+                sentences = sent_tokenize(text)
+        else:
+            # Fallback to NLTK
+            try:
+                sentences = sent_tokenize(text)
+            except:
+                # Final fallback to regex
+                sentences = re.split(r'(?<=[.!?])\s+', text)
+
+        # Clean up sentences
+        return [s for s in sentences if s and len(s.strip()) > 0]
 
 class SelectiveGrammarFixer:
     """Minimal grammar fixes to maintain human-like quality while fixing critical errors"""
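The new splitter prefers spaCy, falls back to NLTK's sent_tokenize, and only drops to a bare regex if neither works. A quick sketch of the practical difference, assuming a humanizer-style instance h that exposes the method (the instance name is illustrative):

    text = "Dr. Smith arrived. He was late! Was the meeting over?"
    print(h.split_into_sentences_advanced(text))
    # With spaCy or NLTK available, abbreviation-aware splitting gives:
    #   ['Dr. Smith arrived.', 'He was late!', 'Was the meeting over?']
    # The regex fallback r'(?<=[.!?])\s+' is cruder: it also splits after the
    # period in "Dr.", yielding ['Dr.', 'Smith arrived.', ...]

Note that the method reloads en_core_web_sm on every call; caching the spaCy pipeline once in __init__ would avoid that repeated cost.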

@@ -642,69 +662,6 @@ class EnhancedDipperHumanizer:
 
         return text
 
-    def preserve_keywords(self, text, keywords):
-        """Mark keywords to preserve them during paraphrasing - SIMPLIFIED"""
-        if not keywords:
-            return text, {}
-
-        # Create a mapping of placeholders to keywords
-        keyword_map = {}
-        modified_text = text
-
-        # Sort keywords by length (longest first) to avoid partial replacements
-        sorted_keywords = sorted(keywords, key=len, reverse=True)
-
-        for i, keyword in enumerate(sorted_keywords):
-            # Use unique markers that won't be confused
-            placeholder = f"KWPH{i:04d}"  # e.g., KWPH0001
-
-            # Find all occurrences of the keyword (case-insensitive)
-            pattern = r'\b' + re.escape(keyword) + r'\b'
-            matches = list(re.finditer(pattern, modified_text, flags=re.IGNORECASE))
-
-            if matches:
-                # Replace all occurrences with the placeholder
-                for match in reversed(matches):  # Reverse to maintain positions
-                    original_keyword = match.group(0)
-                    start, end = match.span()
-                    modified_text = modified_text[:start] + placeholder + modified_text[end:]
-                    # Store the original case version
-                    if placeholder not in keyword_map:
-                        keyword_map[placeholder] = original_keyword
-
-        return modified_text, keyword_map
-
-    def restore_keywords_robust(self, text, keyword_map):
-        """Restore keywords with simple direct replacement"""
-        if not keyword_map:
-            return text
-
-        restored_text = text
-
-        # Simple direct replacement
-        for placeholder, keyword in keyword_map.items():
-            # Direct replacement
-            restored_text = restored_text.replace(placeholder, keyword)
-
-            # Also try with potential variations that might occur
-            restored_text = restored_text.replace(f" {placeholder} ", f" {keyword} ")
-            restored_text = restored_text.replace(f"{placeholder}.", f"{keyword}.")
-            restored_text = restored_text.replace(f"{placeholder},", f"{keyword},")
-            restored_text = restored_text.replace(f"{placeholder}!", f"{keyword}!")
-            restored_text = restored_text.replace(f"{placeholder}?", f"{keyword}?")
-            restored_text = restored_text.replace(f"{placeholder}:", f"{keyword}:")
-            restored_text = restored_text.replace(f"{placeholder};", f"{keyword};")
-            restored_text = restored_text.replace(f"({placeholder})", f"({keyword})")
-            restored_text = restored_text.replace(f'"{placeholder}"', f'"{keyword}"')
-            restored_text = restored_text.replace(f"'{placeholder}'", f"'{keyword}'")
-
-            # Handle case variations
-            restored_text = restored_text.replace(placeholder.lower(), keyword)
-            restored_text = restored_text.replace(placeholder.upper(), keyword)
-            restored_text = restored_text.replace(placeholder.capitalize(), keyword)
-
-        return restored_text.strip()
-
     def should_skip_element(self, element, text):
         """Determine if an element should be skipped from paraphrasing"""
         if not text or len(text.strip()) < 3:
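For context on what is being dropped here: the pair above round-tripped keywords through opaque placeholders so the paraphrasers would leave them untouched. A minimal sketch of that flow (the instance name h is illustrative):

    masked, kmap = h.preserve_keywords(
        "Prep for the GMAT Focus Edition early.", ["GMAT Focus Edition"])
    # masked -> "Prep for the KWPH0000 early."
    # kmap   -> {"KWPH0000": "GMAT Focus Edition"}
    paraphrased = "Get a KWPH0000 prep plan going early."  # ideal model output
    print(h.restore_keywords_robust(paraphrased, kmap))
    # -> "Get a GMAT Focus Edition prep plan going early."

The long tail of .replace() calls in restore_keywords_robust existed because the models sometimes re-cased or re-punctuated the placeholder token itself, which is exactly the fragility this commit removes.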

@@ -895,10 +852,7 @@ class EnhancedDipperHumanizer:
         text = re.sub(r'\s+', ' ', text)
 
         # Remove leading non-letter characters carefully
-
-        if not re.match(r'^(KWPH\d+)', text):
-            # Only remove if it doesn't start with a placeholder
-            text = re.sub(r'^[^a-zA-Z_]+', '', text)
+        text = re.sub(r'^[^a-zA-Z_]+', '', text)
 
         # If we accidentally removed too much, use original
         if len(text) < len(original) * 0.5:
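With the placeholder guard gone, the leading-character cleanup now runs unconditionally. Its effect, for reference:

    import re
    print(re.sub(r'^[^a-zA-Z_]+', '', '## 3. Benefits of exercise'))
    # -> 'Benefits of exercise'  (leading markup, digits, dots, spaces dropped)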

@@ -906,16 +860,13 @@ class EnhancedDipperHumanizer:
 
         return text.strip()
 
-    def paraphrase_with_dipper(self, text, lex_diversity=60, order_diversity=20, keywords=None):
+    def paraphrase_with_dipper(self, text, lex_diversity=60, order_diversity=20):
         """Paraphrase text using Dipper model with sentence-level processing"""
         if not text or len(text.strip()) < 3:
             return text
 
-        # Preserve keywords
-        text_with_placeholders, keyword_map = self.preserve_keywords(text, keywords)
-
         # Split into sentences for better control
-        sentences = self.split_into_sentences_advanced(text_with_placeholders)
+        sentences = self.split_into_sentences_advanced(text)
         paraphrased_sentences = []
 
         # Track sentence patterns to avoid repetition

@@ -928,11 +879,7 @@ class EnhancedDipperHumanizer:
 
             try:
                 # ULTRA-HIGH diversity for Originality AI
-
-                if has_keywords:
-                    lex_diversity = 30  # Moderate for keywords
-                    order_diversity = 10
-                elif len(sentence.split()) < 10:
+                if len(sentence.split()) < 10:
                     lex_diversity = 40  # Very high for short
                     order_diversity = 15
                 else:
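The lex_diversity/order_diversity knobs follow the upstream DIPPER convention, where diversity is steered by control codes prepended to the encoder input. A sketch of that encoding, on the assumption that app.py mirrors the reference implementation (kalpeshk2011/dipper-paraphraser); the exact prompt handling in the elided code may differ:

    # Higher requested diversity maps to a lower control code in the prompt.
    lex_code = 100 - lex_diversity        # lex_diversity=40  -> "lexical = 60"
    order_code = 100 - order_diversity    # order_diversity=15 -> "order = 85"
    prompt = f"lexical = {lex_code}, order = {order_code} <sent> {sentence} </sent>"
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids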

@@ -969,7 +916,7 @@ class EnhancedDipperHumanizer:
                 max_new_length = int(original_length * 1.4)
 
                 # High variation parameters
-                temp = 0.
+                temp = 0.8
                 top_p_val = 0.9
 
                 with torch.no_grad():
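temp and top_p_val are standard sampling knobs; the generate call inside the torch.no_grad() block presumably wires them through along these lines (a sketch of the usual Hugging Face API, not the literal call):

    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            do_sample=True,                 # sample instead of greedy decoding
            temperature=temp,               # 0.8: flatter distribution, more variety
            top_p=top_p_val,                # 0.9: nucleus sampling cutoff
            max_new_tokens=max_new_length,
        )
    paraphrase = tokenizer.decode(outputs[0], skip_special_tokens=True)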

@@ -1017,9 +964,6 @@ class EnhancedDipperHumanizer:
             # Join sentences back
             result = ' '.join(paraphrased_sentences)
 
-            # Restore keywords AFTER joining all sentences
-            result = self.restore_keywords_robust(result, keyword_map)
-
             # Apply natural human patterns
             result = self.add_natural_human_patterns(result)
 

@@ -1089,8 +1033,8 @@ class EnhancedDipperHumanizer:
                 generated += '.'
 
                 # Ensure first letter is capitalized ONLY if it's sentence start
                 # Don't capitalize words like "iPhone" or "eBay"
-                if generated and generated[0].islower() and not self.is_likely_acronym_or_proper_noun(generated.split()[0])
+                if generated and generated[0].islower() and not self.is_likely_acronym_or_proper_noun(generated.split()[0]):
                     generated = generated[0].upper() + generated[1:]
 
                 return generated

@@ -1111,17 +1055,14 @@ class EnhancedDipperHumanizer:
         # Clean up sentences
         return [s for s in sentences if s and len(s.strip()) > 0]
 
-    def paraphrase_with_bart(self, text, keywords=None):
+    def paraphrase_with_bart(self, text):
         """Additional paraphrasing with BART for more variation"""
         if not self.use_bart or not text or len(text.strip()) < 3:
             return text
 
         try:
-            # Preserve keywords
-            text_with_placeholders, keyword_map = self.preserve_keywords(text, keywords)
-
             # Process in smaller chunks for BART
-            sentences = self.split_into_sentences_advanced(text_with_placeholders)
+            sentences = self.split_into_sentences_advanced(text)
             paraphrased_sentences = []
 
             for sentence in sentences:

@@ -1166,9 +1107,6 @@ class EnhancedDipperHumanizer:
 
             result = ' '.join(paraphrased_sentences)
 
-            # Restore keywords AFTER joining all sentences
-            result = self.restore_keywords_robust(result, keyword_map)
-
             # Apply minimal grammar fixes
             result = self.grammar_fixer.smart_fix(result)
 

@@ -1279,8 +1217,7 @@ class EnhancedDipperHumanizer:
             first_word = words[0]
             # Check if it's not an acronym or proper noun that should stay lowercase
             if (first_word[0].islower() and
-                not self.is_likely_acronym_or_proper_noun(first_word) and
-                not first_word.startswith('KWPH')):
+                not self.is_likely_acronym_or_proper_noun(first_word)):
                 # Only capitalize if it's a regular word
                 sentence = first_word[0].upper() + first_word[1:] + ' ' + ' '.join(words[1:])
 

@@ -1427,57 +1364,6 @@ class EnhancedDipperHumanizer:
 
         return html_text
 
-    def wrap_keywords_in_bold(self, html_content, keywords):
-        """Wrap all keyword occurrences with <strong> tags - FIXED VERSION"""
-        if not keywords:
-            return html_content
-
-        # Parse the HTML
-        soup = BeautifulSoup(html_content, 'html.parser')
-
-        # Process each keyword
-        for keyword in keywords:
-            # Find all text nodes that contain this keyword
-            for element in soup.find_all(string=re.compile(re.escape(keyword), re.IGNORECASE)):
-                # Skip if already inside certain tags
-                parent = element.parent
-                if parent and parent.name in ['script', 'style', 'strong', 'b', 'a', 'button',
-                                              'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
-                    continue
-
-                # Get the text content
-                text = str(element)
-
-                # Replace all occurrences of the keyword with <strong> wrapped version
-                # Use a regex to preserve the original case
-                pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
-
-                # Find all matches
-                matches = list(pattern.finditer(text))
-
-                if matches:
-                    # Build the new text with <strong> tags
-                    new_text = ""
-                    last_end = 0
-
-                    for match in matches:
-                        # Add text before the match
-                        new_text += text[last_end:match.start()]
-                        # Add the wrapped keyword (preserving original case)
-                        new_text += f"<strong>{match.group(0)}</strong>"
-                        last_end = match.end()
-
-                    # Add remaining text
-                    new_text += text[last_end:]
-
-                    # Replace the text node with new HTML
-                    new_soup = BeautifulSoup(new_text, 'html.parser')
-                    for new_element in reversed(list(new_soup.contents)):
-                        element.insert_after(new_element)
-                    element.extract()
-
-        return str(soup)
-
     def add_natural_flow_variations(self, text):
         """Add more natural flow and rhythm variations for Originality AI"""
         sentences = self.split_into_sentences_advanced(text)

@@ -1540,7 +1426,7 @@ class EnhancedDipperHumanizer:
 
         return ' '.join(enhanced_sentences)
 
-    def process_html(self, html_content, primary_keywords="", secondary_keywords="", progress_callback=None):
+    def process_html(self, html_content, progress_callback=None):
         """Main processing function with progress callback"""
         if not html_content.strip():
             return "Please provide HTML content."
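After this change the public entry point takes only the HTML plus an optional progress hook. A hypothetical call (the (done, total) callback signature is an assumption; the actual contract is defined in the elided body):

    html_out = humanizer.process_html(
        "<p>Regular exercise is essential for maintaining good health.</p>",
        progress_callback=lambda done, total: print(f"{done}/{total} elements"),
    )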

@@ -1569,37 +1455,12 @@ class EnhancedDipperHumanizer:
         # Get the modified HTML
         html_content = str(soup_temp)
 
-        # Combine keywords and clean them
-        all_keywords = []
-        if primary_keywords:
-            # Clean and validate each keyword
-            for k in primary_keywords.split(','):
-                cleaned = k.strip()
-                if cleaned and len(cleaned) > 1:  # Skip empty or single-char keywords
-                    all_keywords.append(cleaned)
-        if secondary_keywords:
-            for k in secondary_keywords.split(','):
-                cleaned = k.strip()
-                if cleaned and len(cleaned) > 1:
-                    all_keywords.append(cleaned)
-
-        # Remove duplicates while preserving order
-        seen = set()
-        unique_keywords = []
-        for k in all_keywords:
-            if k.lower() not in seen:
-                seen.add(k.lower())
-                unique_keywords.append(k)
-        all_keywords = unique_keywords
-
         try:
             # Extract text elements
             soup, text_elements = self.extract_text_from_html(html_content)
 
             total_elements = len(text_elements)
             print(f"Found {total_elements} text elements to process (after filtering)")
-            if all_keywords:
-                print(f"Preserving keywords: {all_keywords}")
 
             # Process each text element
             processed_count = 0

@@ -1615,20 +1476,18 @@ class EnhancedDipperHumanizer:
                 if len(original_text.split()) < 3:
                     continue
 
                 # First pass with Dipper
                 paraphrased_text = self.paraphrase_with_dipper(
                     original_text,
-                    keywords=all_keywords
+                    lex_diversity=60,
+                    order_diversity=20
                 )
 
                 # Second pass with BART for longer texts (increased probability)
                 if self.use_bart and len(paraphrased_text.split()) > 8:
-                    # 50% chance to use BART for more variation
+                    # 20% chance to use BART for more variation
                     if random.random() < 0.2:
-                        paraphrased_text = self.paraphrase_with_bart(
-                            paraphrased_text,
-                            keywords=all_keywords
-                        )
+                        paraphrased_text = self.paraphrase_with_bart(paraphrased_text)
 
                 # Apply sentence variation
                 paraphrased_text = self.apply_sentence_variation(paraphrased_text)

@@ -1665,9 +1524,6 @@ class EnhancedDipperHumanizer:
                 placeholder = style_placeholder.format(idx)
                 result = result.replace(placeholder, style_content)
 
-        # NOW wrap keywords in bold tags after all processing is complete
-        result = self.wrap_keywords_in_bold(result, all_keywords)
-
         # Post-process the entire HTML to fix bold/strong formatting
         result = self.post_process_html(result)
 

@@ -1802,7 +1658,7 @@ class EnhancedDipperHumanizer:
 # Initialize the humanizer
 humanizer = EnhancedDipperHumanizer()
 
-def humanize_html(html_input, primary_keywords="", secondary_keywords="", progress=gr.Progress()):
+def humanize_html(html_input, progress=gr.Progress()):
     """Gradio interface function with progress updates"""
     if not html_input:
         return "Please provide HTML content to humanize."

@@ -1818,8 +1674,6 @@ def humanize_html(html_input, primary_keywords="", secondary_keywords="", progress=gr.Progress()):
         # Pass progress callback to process_html
         result = humanizer.process_html(
             html_input,
-            primary_keywords,
-            secondary_keywords,
             progress_callback=progress_callback
         )
 
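The gr.Progress object Gradio injects is adapted to the internal progress_callback somewhere in the elided lines above, plausibly along these lines (an assumed bridge, not the actual body):

    def progress_callback(done, total):
        # gr.Progress instances are callable with a 0..1 fraction plus a label
        progress(done / max(total, 1), desc=f"Humanizing element {done}/{total}")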

@@ -1837,14 +1691,6 @@ iface = gr.Interface(
             lines=10,
             placeholder="Paste your HTML content here...",
             label="HTML Input"
-        ),
-        gr.Textbox(
-            placeholder="Enter primary keywords separated by commas (e.g., GMAT Focus Edition, MBA, Data Insights)",
-            label="Primary Keywords (preserved exactly)"
-        ),
-        gr.Textbox(
-            placeholder="Enter secondary keywords separated by commas (e.g., test preparation, business school)",
-            label="Secondary Keywords (preserved exactly)"
         )
     ],
     outputs=gr.Textbox(

@@ -1861,8 +1707,6 @@ iface = gr.Interface(
     - Natural typos, contractions, and conversational flow
     - Stream-of-consciousness elements and rhetorical questions
     - Originality AI-specific optimizations: varied sentence starters, emphatic repetitions
-    - Fixed placeholder system that preserves keywords
-    - Keywords are automatically wrapped with <strong> tags
     - Skips content in <strong>, <b>, and heading tags (including inside tables)
     - Designed to pass the strictest AI detection systems
 

@@ -1876,7 +1720,7 @@ iface = gr.Interface(
     <div class="author-intro">By John Doe, Fitness Expert | 10 years experience</div>
     <p>Regular exercise is essential for maintaining good health. It helps improve cardiovascular fitness, strengthens muscles, and enhances mental well-being. Studies have shown that people who exercise regularly have lower risks of chronic diseases.</p>
     <p>Additionally, exercise can boost mood and energy levels. It releases endorphins, which are natural mood elevators. Even moderate activities like walking can make a significant difference in overall health.</p>
-    </article>"""
+    </article>"""]
     ],
     theme="default"
 )