Spaces:

EdysorEdutech
/

human_final

Paused

App Files Files Community

EdysorEdutech commited on Aug 4, 2025

Commit

afd7422

verified ·

1 Parent(s): dcfc371

Update app.py

Browse files

Files changed (1) hide show

app.py +185 -56

app.py CHANGED Viewed

@@ -829,71 +829,200 @@ class EnhancedDipperHumanizer:
         return soup, text_elements, script_placeholders
     def process_html(self, html_content, progress_callback=None):
-        """Main processing function with progress callback"""
-        if not html_content.strip():
-            return "Please provide HTML content."
-        try:
-            # Extract text elements with script preservation
-            soup, text_elements, script_placeholders = self.extract_text_from_html(html_content)
-            total_elements = len(text_elements)
-            print(f"Found {total_elements} text elements to process (after filtering)")
-            # Process each text element
-            processed_count = 0
-            for i, element_info in enumerate(text_elements):
-                original_text = element_info['text']
-                # Skip placeholders
-                if "###SCRIPT_" in original_text:
-                    continue
-                # Skip very short texts
-                if len(original_text.split()) < 3:
-                    continue
-                # Process the text with your existing logic
-                paraphrased_text = self.paraphrase_with_dipper(
-                    original_text,
-                    lex_diversity=60,
-                    order_diversity=20
-                )
-                # Apply other transformations...
-                paraphrased_text = self.apply_sentence_variation(paraphrased_text)
-                paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
-                paraphrased_text = self.fix_punctuation(paraphrased_text)
-                # Final quality check
-                if paraphrased_text and len(paraphrased_text.split()) >= 3:
-                    element_info['element'].replace_with(NavigableString(paraphrased_text))
-                    processed_count += 1
-                # Progress update
-                if progress_callback:
-                    progress_callback(i + 1, total_elements)
-            # Get the processed HTML
-            result_html = str(soup)
-            # CRITICAL: Restore all script content exactly as it was
-            for placeholder, original_script in script_placeholders.items():
-                result_html = result_html.replace(f"<script>{placeholder}</script>", original_script)
-            # Post-process the entire HTML
-            result_html = self.post_process_html(result_html)
-            result_html = self.validate_and_fix_html(result_html)
-            print(f"Successfully processed {processed_count} text elements")
-            return result_html
-        except Exception as e:
-            import traceback
-            error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
-            print(error_msg)
-            return f"<!-- {error_msg} -->\n{html_content}"
     def is_likely_acronym_or_proper_noun(self, word):
         """Check if a word is likely an acronym or part of a proper noun"""

         return soup, text_elements, script_placeholders
     def process_html(self, html_content, progress_callback=None):
+    """Main processing function with progress callback"""
+    if not html_content.strip():
+        return "Please provide HTML content."
+    # Parse the HTML first
+    soup_initial = BeautifulSoup(html_content, 'html.parser')
+    # Store ALL scripts (both external and inline) with their full content
+    script_storage = []
+    script_placeholder_template = "<!--SCRIPT_PLACEHOLDER_{}-->"
+    # Find and replace all script tags
+    for idx, script in enumerate(soup_initial.find_all('script')):
+        placeholder = script_placeholder_template.format(idx)
+        # Store the entire script tag as a string
+        script_storage.append(str(script))
+        # Replace with a comment placeholder
+        new_tag = soup_initial.new_string(placeholder)
+        script.replace_with(new_tag)
+    # Also store and replace style tags
+    style_storage = []
+    style_placeholder_template = "<!--STYLE_PLACEHOLDER_{}-->"
+    for idx, style in enumerate(soup_initial.find_all('style')):
+        placeholder = style_placeholder_template.format(idx)
+        style_storage.append(str(style))
+        new_tag = soup_initial.new_string(placeholder)
+        style.replace_with(new_tag)
+    # Get the modified HTML
+    html_content = str(soup_initial)
+    try:
+        # Extract text elements
+        soup, text_elements = self.extract_text_from_html(html_content)
+        total_elements = len(text_elements)
+        print(f"Found {total_elements} text elements to process (after filtering)")
+        # Process each text element
+        processed_count = 0
+        for i, element_info in enumerate(text_elements):
+            original_text = element_info['text']
+            # Skip placeholders
+            if "SCRIPT_PLACEHOLDER" in original_text or "STYLE_PLACEHOLDER" in original_text:
+                continue
+            # Skip very short texts
+            if len(original_text.split()) < 3:
+                continue
+            # First pass with Dipper
+            paraphrased_text = self.paraphrase_with_dipper(
+                original_text,
+                lex_diversity=60,
+                order_diversity=20
+            )
+            # Second pass with BART for longer texts (balanced probability)
+            if self.use_bart and len(paraphrased_text.split()) > 8:
+                # 30% chance to use BART for more variation (balanced)
+                if random.random() < 0.3:
+                    paraphrased_text = self.paraphrase_with_bart(paraphrased_text)
+            # Apply sentence variation
+            paraphrased_text = self.apply_sentence_variation(paraphrased_text)
+            # Add natural flow variations
+            paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
+            # Fix punctuation and formatting
+            paraphrased_text = self.fix_punctuation(paraphrased_text)
+            # Final quality check
+            if paraphrased_text and len(paraphrased_text.split()) >= 3:
+                element_info['element'].replace_with(NavigableString(paraphrased_text))
+                processed_count += 1
+            # Progress update
+            if progress_callback:
+                progress_callback(i + 1, total_elements)
+            if i % 10 == 0 or i == total_elements - 1:
+                progress = (i + 1) / total_elements * 100
+                print(f"Progress: {progress:.1f}%")
+        # Get the processed HTML
+        result = str(soup)
+        # Restore all script tags exactly as they were
+        for idx, script_content in enumerate(script_storage):
+            placeholder = script_placeholder_template.format(idx)
+            result = result.replace(placeholder, script_content)
+        # Restore all style tags exactly as they were
+        for idx, style_content in enumerate(style_storage):
+            placeholder = style_placeholder_template.format(idx)
+            result = result.replace(placeholder, style_content)
+        # Post-process the entire HTML to fix bold/strong formatting
+        result = self.post_process_html(result)
+        # Validate and fix HTML syntax (but protect scripts)
+        result = self.validate_and_fix_html_safe(result)
+        print(f"Successfully processed {processed_count} text elements")
+        print(f"Preserved {len(script_storage)} script tags and {len(style_storage)} style tags")
+        return result
+    except Exception as e:
+        import traceback
+        error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
+        print(error_msg)
+        # Return original HTML with error message prepended as HTML comment
+        return f"<!-- {error_msg} -->\n{html_content}"
+def validate_and_fix_html_safe(self, html_text):
+    """Fix common HTML syntax errors after processing while protecting scripts"""
+    # First, extract and protect script content
+    script_pattern = r'<script[^>]*>.*?</script>'
+    scripts = re.findall(script_pattern, html_text, re.DOTALL | re.IGNORECASE)
+    script_placeholders = {}
+    for i, script_content in enumerate(scripts):
+        placeholder = f"<!--PROTECTED_SCRIPT_{i}-->"
+        script_placeholders[placeholder] = script_content
+        html_text = html_text.replace(script_content, placeholder, 1)
+    # Also protect style tags
+    style_pattern = r'<style[^>]*>.*?</style>'
+    styles = re.findall(style_pattern, html_text, re.DOTALL | re.IGNORECASE)
+    style_placeholders = {}
+    for i, style_content in enumerate(styles):
+        placeholder = f"<!--PROTECTED_STYLE_{i}-->"
+        style_placeholders[placeholder] = style_content
+        html_text = html_text.replace(style_content, placeholder, 1)
+    # Fix DOCTYPE
+    html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
+    # Fix spacing issues (but not inside scripts/styles)
+    html_text = re.sub(r'>\s+<', '><', html_text)
+    html_text = re.sub(r'\s+>', '>', html_text)
+    html_text = re.sub(r'<\s+', '<', html_text)
+    # Fix common word errors that might occur during processing
+    html_text = html_text.replace('down loaded', 'downloaded')
+    html_text = html_text.replace('But your document', 'Your document')
+    # Restore protected scripts
+    for placeholder, script_content in script_placeholders.items():
+        html_text = html_text.replace(placeholder, script_content)
+    # Restore protected styles
+    for placeholder, style_content in style_placeholders.items():
+        html_text = html_text.replace(placeholder, style_content)
+    return html_text
+    def extract_text_from_html(self, html_content):
+    """Extract text elements from HTML with skip logic"""
+    soup = BeautifulSoup(html_content, 'html.parser')
+    text_elements = []
+    # Get all text nodes using strings (the correct method)
+    for element in soup.strings:
+        # Skip if parent is script, style, or noscript
+        if element.parent.name in ['script', 'style', 'noscript']:
+            continue
+        text = element.strip()
+        # Skip empty strings
+        if not text:
+            continue
+        # Skip placeholder texts
+        if "SCRIPT_PLACEHOLDER" in text or "STYLE_PLACEHOLDER" in text:
+            continue
+        # Use the existing should_skip_element logic
+        if not self.should_skip_element(element, text):
+            text_elements.append({
+                'text': text,
+                'element': element
+            })
+    return soup, text_elements
     def is_likely_acronym_or_proper_noun(self, word):
         """Check if a word is likely an acronym or part of a proper noun"""