Spaces:

EdysorEdutech
/

human_final

Paused

App Files Files Community

EdysorEdutech committed on Aug 4

Commit

942fc21

verified ·

1 Parent(s): d97439e

Update app.py

Browse files

Files changed (1) hide show

app.py +151 -171

app.py CHANGED Viewed

@@ -663,150 +663,122 @@ class EnhancedDipperHumanizer:
         return text
     def should_skip_element(self, element, text):
-        """Determine if an element should be skipped from paraphrasing"""
-        if not text or len(text.strip()) < 3:
             return True
-        # Skip JavaScript code inside script tags
-        parent = element.parent
-        if parent and parent.name in ['script', 'style', 'noscript']:
-            return True
-        # Skip inline JavaScript (onclick, onchange, etc.)
-        if parent and parent.attrs:
-            for attr_name, attr_value in parent.attrs.items():
-                if attr_name.startswith('on') and 'selectOption' in str(attr_value):
-                    return True
-        # Skip if text contains JavaScript function calls
-        if 'function' in text or 'selectOption' in text or '=>' in text:
-            return True
-        # Skip headings (h1-h6)
-        if parent and parent.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
-            return True
-        # Skip content inside <strong> and <b> tags
-        if parent and parent.name in ['strong', 'b']:
-            return True
-        # Skip table content
-        if parent and (parent.name in ['td', 'th'] or any(p.name == 'table' for p in parent.parents)):
-            return True
-        # Special handling for content inside tables
-        # Skip if it's inside strong/b/h1-h6 tags AND also inside a table
-        if parent:
-            # Check if we're inside a table
-            is_in_table = any(p.name == 'table' for p in parent.parents)
-            if is_in_table:
-                # If we're in a table, skip any text that's inside formatting tags
-                if parent.name in ['strong', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'em', 'i']:
-                    return True
-                # Also check if parent's parent is a formatting tag
-                if parent.parent and parent.parent.name in ['strong', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
-                    return True
-        # Skip table of contents
-        if parent:
-            parent_text = str(parent).lower()
-            if any(toc in parent_text for toc in ['table of contents', 'toc-', 'contents']):
-                return True
-        # Skip CTAs and buttons
-        if parent and parent.name in ['button', 'a']:
-            return True
-        # Skip if parent has onclick or other event handlers
-        if parent and parent.attrs:
-            event_handlers = ['onclick', 'onchange', 'onsubmit', 'onload', 'onmouseover', 'onmouseout']
-            if any(handler in parent.attrs for handler in event_handlers):
-                return True
-        # Special check for testimonial cards - check up to 3 levels of ancestors
-        if parent:
-            ancestors_to_check = []
-            current = parent
-            for _ in range(3):  # Check up to 3 levels up
-                if current:
-                    ancestors_to_check.append(current)
-                    current = current.parent
-            # Check if any ancestor has testimonial-card class
-            for ancestor in ancestors_to_check:
-                if ancestor and ancestor.get('class'):
-                    classes = ancestor.get('class', [])
-                    if isinstance(classes, list):
-                        if any('testimonial-card' in str(cls) for cls in classes):
-                            return True
-                    elif isinstance(classes, str) and 'testimonial-card' in classes:
-                        return True
-        # Skip if IMMEDIATE parent or element itself has skip-worthy classes/IDs
-        skip_indicators = [
-            'button', 'btn', 'heading', 'title', 'caption',
-            'toc-', 'contents', 'quiz', 'tip', 'note', 'alert',
-            'warning', 'info', 'success', 'error', 'code', 'pre',
-            'stats-grid', 'testimonial-card',
-            'cta-box', 'quiz-container', 'contact-form',
-            'faq-question', 'sidebar', 'widget', 'banner',
-            'author-intro', 'testimonial', 'review', 'feedback',
-            'floating-', 'stat-', 'progress-', 'option', 'results',
-            'question-container', 'quiz-',
-            'comparision-tables', 'process-flowcharts', 'infographics', 'cost-breakdown'
-        ]
-        # Skip quiz-related elements
-        quiz_indicators = ['quiz-container', 'question-container', 'questionContainer',
-                           'results', 'progressFill', 'currentQuestion', 'totalQuestions']
-        # Check only immediate parent and grandparent (not all ancestors)
-        elements_to_check = [parent]
-        if parent and parent.parent:
-            elements_to_check.append(parent.parent)
-        for elem in elements_to_check:
-            if not elem:
-                continue
-            # Check element's class
-            elem_class = elem.get('class', [])
-            if isinstance(elem_class, list):
-                class_str = ' '.join(str(cls).lower() for cls in elem_class)
-                if any(indicator in class_str for indicator in skip_indicators):
-                    return True
-                if any(indicator in class_str for indicator in quiz_indicators):
-                    return True
-            # Check element's ID
-            elem_id = elem.get('id', '')
-            if any(indicator in str(elem_id).lower() for indicator in skip_indicators):
-                return True
-            if elem_id in quiz_indicators:
-                return True
-        # Skip short phrases that might be UI elements
-        word_count = len(text.split())
-        if word_count <= 5:
-            ui_patterns = [
-                'click', 'download', 'learn more', 'read more', 'sign up',
-                'get started', 'try now', 'buy now', 'next', 'previous',
-                'back', 'continue', 'submit', 'cancel', 'get now', 'book your',
-                'check out:', 'see also:', 'related:', 'question', 'of'
-            ]
-            if any(pattern in text.lower() for pattern in ui_patterns):
-                return True
-        # Skip very short content in styled containers
-        if parent and parent.name in ['div', 'section', 'aside', 'blockquote']:
-            style = parent.get('style', '')
-            if 'border' in style or 'background' in style:
-                if word_count <= 20:
-                    # But don't skip if it's inside a paragraph
-                    if not any(p.name == 'p' for p in parent.parents):
-                        return True
-        return False
     def is_likely_acronym_or_proper_noun(self, word):
         """Check if a word is likely an acronym or part of a proper noun"""
@@ -1350,25 +1322,12 @@ class EnhancedDipperHumanizer:
         soup = BeautifulSoup(html_content, 'html.parser')
         text_elements = []
-        # Get all elements with onclick, onchange, etc.
-        elements_with_handlers = soup.find_all(attrs=lambda x: x and any(k.startswith('on') for k in x.keys()))
-        # Mark these elements to skip their text content
-        skip_elements = set()
-        for elem in elements_with_handlers:
-            skip_elements.add(elem)
-            skip_elements.update(elem.descendants)
         # Get all text nodes using string instead of text (fixing deprecation)
         for element in soup.find_all(string=True):
             # Skip script, style, and noscript content completely
             if element.parent.name in ['script', 'style', 'noscript']:
                 continue
-            # Skip if element or any parent has event handlers
-            if any(parent in skip_elements for parent in element.parents):
-                continue
             text = element.strip()
             if text and not self.should_skip_element(element, text):
                 text_elements.append({
@@ -1379,21 +1338,42 @@ class EnhancedDipperHumanizer:
         return soup, text_elements
     def validate_and_fix_html(self, html_text):
-        """Fix common HTML syntax errors after processing"""
-        # Fix DOCTYPE
-        html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
-        # Fix spacing issues
-        html_text = re.sub(r'>\s+<', '><', html_text)  # Remove extra spaces between tags
-        html_text = re.sub(r'\s+>', '>', html_text)  # Remove spaces before closing >
-        html_text = re.sub(r'<\s+', '<', html_text)  # Remove spaces after opening
-        # Fix common word errors that might occur during processing
-        html_text = html_text.replace('down loaded', 'downloaded')
-        html_text = html_text.replace('But your document', 'Your document')
-        return html_text
     def add_natural_flow_variations(self, text):
         """Add more natural flow and rhythm variations for Originality AI"""
@@ -1468,23 +1448,23 @@ class EnhancedDipperHumanizer:
         preserved_scripts = []
         preserved_styles = []
-        # Parse HTML
-        soup = BeautifulSoup(html_content, 'html.parser')
-        # Find and preserve all script tags WITH their content
-        for idx, script in enumerate(soup.find_all('script')):
-            placeholder = BeautifulSoup(f'<div>{script_placeholder.format(idx)}</div>', 'html.parser').div
             preserved_scripts.append(str(script))
             script.replace_with(placeholder)
         # Preserve all style tags
-        for idx, style in enumerate(soup.find_all('style')):
-            placeholder = BeautifulSoup(f'<div>{style_placeholder.format(idx)}</div>', 'html.parser').div
             preserved_styles.append(str(style))
             style.replace_with(placeholder)
         # Get the modified HTML
-        html_content = str(soup)
         try:
             # Extract text elements
@@ -1548,12 +1528,12 @@ class EnhancedDipperHumanizer:
             # Restore all script tags
             for idx, script_content in enumerate(preserved_scripts):
                 placeholder = script_placeholder.format(idx)
-                result = result.replace(f'<div>{placeholder}</div>', script_content)
             # Restore all style tags
             for idx, style_content in enumerate(preserved_styles):
                 placeholder = style_placeholder.format(idx)
-                result = result.replace(f'<div>{placeholder}</div>', style_content)
             # Post-process the entire HTML to fix bold/strong formatting
             result = self.post_process_html(result)

         return text
     def should_skip_element(self, element, text):
+        """Determine whether a text node should be excluded from paraphrasing.
+
+        Args:
+            element: The text node under consideration. Only its ``parents``
+                iterable is consulted; each ancestor must expose ``name``.
+            text: The node's text content.
+
+        Returns:
+            True when ``text`` is empty or shorter than three characters
+            after stripping, or when any ancestor is a ``script``, ``style``
+            or ``noscript`` tag; False otherwise.
+        """
+        # Too short to be worth paraphrasing.
+        if not text or len(text.strip()) < 3:
+            return True
+        # A single walk over the ancestors covers the immediate parent as
+        # well as deeper nesting inside a non-content tag.
+        non_content_tags = ('script', 'style', 'noscript')
+        if any(ancestor.name in non_content_tags for ancestor in element.parents):
+            return True
+        # TODO(review): no heading/table/button/class-based filters are
+        # applied here, so such text is reported as paraphrasable. Add them
+        # back if that content must stay untouched.
+        return False
+    def extract_text_from_html(self, html_content):
+        """Parse HTML and collect the text nodes eligible for paraphrasing.
+
+        Args:
+            html_content: HTML markup to parse.
+
+        Returns:
+            A ``(soup, text_elements, script_placeholders)`` tuple. ``soup``
+            is the parsed tree, ``text_elements`` is a list of
+            ``{'text': str, 'element': node}`` dicts, and
+            ``script_placeholders`` is a placeholder-to-markup dict kept so
+            callers can keep unpacking three values. It is always empty
+            because script tags are left untouched in the tree.
+        """
+        # Local import: comments, doctypes, CDATA sections and similar
+        # special strings all derive from this class.
+        from bs4.element import PreformattedString
+
+        soup = BeautifulSoup(html_content, 'html.parser')
+        text_elements = []
+        # Script bodies are not overwritten with placeholders: a placeholder
+        # that is not swapped back afterwards (for example on a tag that
+        # carries attributes) would destroy the script, whereas an untouched
+        # tag is serialized exactly as str(script) would render it.
+        script_placeholders = {}
+        for element in soup.find_all(string=True):
+            # Replacing a comment or doctype node with a plain string would
+            # turn it into visible page text, so these are never candidates.
+            if isinstance(element, PreformattedString):
+                continue
+            # Skip script, style, and noscript content completely
+            if element.parent.name in ('script', 'style', 'noscript'):
+                continue
+            text = element.strip()
+            if text and not self.should_skip_element(element, text):
+                text_elements.append({'text': text, 'element': element})
+        return soup, text_elements, script_placeholders
+    def process_html(self, html_content, progress_callback=None):
+        """Paraphrase the eligible text of an HTML document.
+
+        Args:
+            html_content: HTML markup to process.
+            progress_callback: Optional callable invoked as
+                ``progress_callback(done, total)`` once per collected text
+                element, whether or not that element was rewritten.
+
+        Returns:
+            The processed HTML string; the fixed message
+            ``"Please provide HTML content."`` for empty input; or, when an
+            exception occurs, the original HTML preceded by an HTML comment
+            that contains the error message and traceback.
+        """
+        if not html_content or not html_content.strip():
+            return "Please provide HTML content."
+        try:
+            # Local import so the name is in scope regardless of which bs4
+            # names the module imports at the top.
+            from bs4 import NavigableString
+
+            # Extract text elements with script preservation
+            soup, text_elements, script_placeholders = self.extract_text_from_html(html_content)
+            total_elements = len(text_elements)
+            print(f"Found {total_elements} text elements to process (after filtering)")
+            processed_count = 0
+            for position, element_info in enumerate(text_elements, start=1):
+                original_text = element_info['text']
+                # Placeholders and texts under three words are left as-is.
+                if "###SCRIPT_" not in original_text and len(original_text.split()) >= 3:
+                    paraphrased_text = self.paraphrase_with_dipper(
+                        original_text,
+                        lex_diversity=60,
+                        order_diversity=20
+                    )
+                    paraphrased_text = self.apply_sentence_variation(paraphrased_text)
+                    paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
+                    paraphrased_text = self.fix_punctuation(paraphrased_text)
+                    # Final quality check: keep the original node when the
+                    # rewrite came back empty or too short.
+                    if paraphrased_text and len(paraphrased_text.split()) >= 3:
+                        element_info['element'].replace_with(NavigableString(paraphrased_text))
+                        processed_count += 1
+                # Reported for skipped elements too, so the final call
+                # always reaches (total, total).
+                if progress_callback:
+                    progress_callback(position, total_elements)
+            result_html = str(soup)
+            # Put preserved scripts back. The opening tag is matched with
+            # any attributes, and a function replacement keeps backslashes
+            # in the script from being read as regex escapes.
+            for placeholder, original_script in script_placeholders.items():
+                pattern = r'<script\b[^>]*>' + re.escape(placeholder) + r'</script>'
+                result_html = re.sub(
+                    pattern,
+                    lambda _match, restored=original_script: restored,
+                    result_html,
+                )
+            # Post-process the entire HTML
+            result_html = self.post_process_html(result_html)
+            result_html = self.validate_and_fix_html(result_html)
+            print(f"Successfully processed {processed_count} text elements")
+            return result_html
+        except Exception as e:
+            import traceback
+            error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
+            print(error_msg)
+            # A "-->" inside the message would close the HTML comment early
+            # and spill the rest of the traceback into the page.
+            safe_msg = error_msg.replace('--', '- -')
+            return f"<!-- {safe_msg} -->\n{html_content}"
     def is_likely_acronym_or_proper_noun(self, word):
         """Check if a word is likely an acronym or part of a proper noun"""
         soup = BeautifulSoup(html_content, 'html.parser')
         text_elements = []
         # Get all text nodes using string instead of text (fixing deprecation)
         for element in soup.find_all(string=True):
             # Skip script, style, and noscript content completely
             if element.parent.name in ['script', 'style', 'noscript']:
                 continue
             text = element.strip()
             if text and not self.should_skip_element(element, text):
                 text_elements.append({
         return soup, text_elements
     def validate_and_fix_html(self, html_text):
+        """Fix common HTML syntax errors after processing.
+
+        Normalizes a spaced-out DOCTYPE, removes whitespace between tags and
+        next to angle brackets, and corrects two known word errors. Complete
+        ``<script>`` and ``<style>`` blocks, with or without attributes, are
+        set aside first and put back verbatim afterwards.
+
+        Args:
+            html_text: HTML markup to clean up.
+
+        Returns:
+            The cleaned markup. Empty or ``None`` input is returned as-is.
+        """
+        if not html_text:
+            return html_text
+
+        protected_blocks = []
+
+        def _stash(match):
+            # Swap the whole block, tags included, for an HTML comment that
+            # none of the substitutions below can alter.
+            protected_blocks.append(match.group(0))
+            return f"<!--PROTECTED_BLOCK_{len(protected_blocks) - 1}-->"
+
+        html_text = re.sub(
+            r'<(script|style)\b[^>]*>.*?</\1\s*>',
+            _stash,
+            html_text,
+            flags=re.DOTALL | re.IGNORECASE,
+        )
+        # Fix DOCTYPE
+        html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
+        # Fix spacing issues (script/style blocks are out of reach here)
+        html_text = re.sub(r'>\s+<', '><', html_text)
+        html_text = re.sub(r'\s+>', '>', html_text)
+        html_text = re.sub(r'<\s+', '<', html_text)
+        # Fix common word errors that might occur during processing
+        html_text = html_text.replace('down loaded', 'downloaded')
+        html_text = html_text.replace('But your document', 'Your document')
+        # Restore the protected blocks exactly as they were found
+        for index, block in enumerate(protected_blocks):
+            html_text = html_text.replace(f"<!--PROTECTED_BLOCK_{index}-->", block)
+        return html_text
     def add_natural_flow_variations(self, text):
         """Add more natural flow and rhythm variations for Originality AI"""
         preserved_scripts = []
         preserved_styles = []
+        # Temporarily replace script and style tags with placeholders
+        soup_temp = BeautifulSoup(html_content, 'html.parser')
+        # Preserve all script tags
+        for idx, script in enumerate(soup_temp.find_all('script')):
+            placeholder = script_placeholder.format(idx)
             preserved_scripts.append(str(script))
             script.replace_with(placeholder)
         # Preserve all style tags
+        for idx, style in enumerate(soup_temp.find_all('style')):
+            placeholder = style_placeholder.format(idx)
             preserved_styles.append(str(style))
             style.replace_with(placeholder)
         # Get the modified HTML
+        html_content = str(soup_temp)
         try:
             # Extract text elements
             # Restore all script tags
             for idx, script_content in enumerate(preserved_scripts):
                 placeholder = script_placeholder.format(idx)
+                result = result.replace(placeholder, script_content)
             # Restore all style tags
             for idx, style_content in enumerate(preserved_styles):
                 placeholder = style_placeholder.format(idx)
+                result = result.replace(placeholder, style_content)
             # Post-process the entire HTML to fix bold/strong formatting
             result = self.post_process_html(result)