Spaces:

EdysorEdutech
/

human_final

Paused

App Files Files Community

EdysorEdutech committed on Aug 4, 2025

Commit

d97439e

verified ·

1 Parent(s): b39d59a

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -11

app.py CHANGED Viewed

@@ -671,6 +671,16 @@ class EnhancedDipperHumanizer:
         parent = element.parent
         if parent and parent.name in ['script', 'style', 'noscript']:
             return True
         # Skip headings (h1-h6)
         if parent and parent.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
@@ -746,6 +756,10 @@ class EnhancedDipperHumanizer:
             'comparision-tables', 'process-flowcharts', 'infographics', 'cost-breakdown'
         ]
         # Check only immediate parent and grandparent (not all ancestors)
         elements_to_check = [parent]
         if parent and parent.parent:
@@ -761,11 +775,15 @@ class EnhancedDipperHumanizer:
                 class_str = ' '.join(str(cls).lower() for cls in elem_class)
                 if any(indicator in class_str for indicator in skip_indicators):
                     return True
             # Check element's ID
             elem_id = elem.get('id', '')
             if any(indicator in str(elem_id).lower() for indicator in skip_indicators):
                 return True
         # Skip short phrases that might be UI elements
         word_count = len(text.split())
@@ -1332,12 +1350,25 @@ class EnhancedDipperHumanizer:
         soup = BeautifulSoup(html_content, 'html.parser')
         text_elements = []
         # Get all text nodes using string instead of text (fixing deprecation)
         for element in soup.find_all(string=True):
             # Skip script, style, and noscript content completely
             if element.parent.name in ['script', 'style', 'noscript']:
                 continue
             text = element.strip()
             if text and not self.should_skip_element(element, text):
                 text_elements.append({
@@ -1356,7 +1387,7 @@ class EnhancedDipperHumanizer:
         # Fix spacing issues
         html_text = re.sub(r'>\s+<', '><', html_text)  # Remove extra spaces between tags
         html_text = re.sub(r'\s+>', '>', html_text)  # Remove spaces before closing >
-        html_text = re.sub(r'<\s+', '<', html_text)  # Remove spaces after opening <
         # Fix common word errors that might occur during processing
         html_text = html_text.replace('down loaded', 'downloaded')
@@ -1437,23 +1468,23 @@ class EnhancedDipperHumanizer:
         preserved_scripts = []
         preserved_styles = []
-        # Temporarily replace script and style tags with placeholders
-        soup_temp = BeautifulSoup(html_content, 'html.parser')
-        # Preserve all script tags
-        for idx, script in enumerate(soup_temp.find_all('script')):
-            placeholder = script_placeholder.format(idx)
             preserved_scripts.append(str(script))
             script.replace_with(placeholder)
         # Preserve all style tags
-        for idx, style in enumerate(soup_temp.find_all('style')):
-            placeholder = style_placeholder.format(idx)
             preserved_styles.append(str(style))
             style.replace_with(placeholder)
         # Get the modified HTML
-        html_content = str(soup_temp)
         try:
             # Extract text elements
@@ -1517,12 +1548,12 @@ class EnhancedDipperHumanizer:
             # Restore all script tags
             for idx, script_content in enumerate(preserved_scripts):
                 placeholder = script_placeholder.format(idx)
-                result = result.replace(placeholder, script_content)
             # Restore all style tags
             for idx, style_content in enumerate(preserved_styles):
                 placeholder = style_placeholder.format(idx)
-                result = result.replace(placeholder, style_content)
             # Post-process the entire HTML to fix bold/strong formatting
             result = self.post_process_html(result)

         parent = element.parent
         if parent and parent.name in ['script', 'style', 'noscript']:
             return True
+        # Skip inline JavaScript (onclick, onchange, etc.)
+        if parent and parent.attrs:
+            for attr_name, attr_value in parent.attrs.items():
+                if attr_name.startswith('on') and 'selectOption' in str(attr_value):
+                    return True
+        # Skip if text contains JavaScript function calls
+        if 'function' in text or 'selectOption' in text or '=>' in text:
+            return True
         # Skip headings (h1-h6)
         if parent and parent.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
             'comparision-tables', 'process-flowcharts', 'infographics', 'cost-breakdown'
         ]
+        # Skip quiz-related elements
+        quiz_indicators = ['quiz-container', 'question-container', 'questionContainer',
+                           'results', 'progressFill', 'currentQuestion', 'totalQuestions']
         # Check only immediate parent and grandparent (not all ancestors)
         elements_to_check = [parent]
         if parent and parent.parent:
                 class_str = ' '.join(str(cls).lower() for cls in elem_class)
                 if any(indicator in class_str for indicator in skip_indicators):
                     return True
+                if any(indicator in class_str for indicator in quiz_indicators):
+                    return True
             # Check element's ID
             elem_id = elem.get('id', '')
             if any(indicator in str(elem_id).lower() for indicator in skip_indicators):
                 return True
+            if elem_id in quiz_indicators:
+                return True
         # Skip short phrases that might be UI elements
         word_count = len(text.split())
         soup = BeautifulSoup(html_content, 'html.parser')
         text_elements = []
+        # Get all elements with onclick, onchange, etc.
+        elements_with_handlers = soup.find_all(attrs=lambda x: x and any(k.startswith('on') for k in x.keys()))
+        # Mark these elements to skip their text content
+        skip_elements = set()
+        for elem in elements_with_handlers:
+            skip_elements.add(elem)
+            skip_elements.update(elem.descendants)
         # Get all text nodes using string instead of text (fixing deprecation)
         for element in soup.find_all(string=True):
             # Skip script, style, and noscript content completely
             if element.parent.name in ['script', 'style', 'noscript']:
                 continue
+            # Skip if element or any parent has event handlers
+            if any(parent in skip_elements for parent in element.parents):
+                continue
             text = element.strip()
             if text and not self.should_skip_element(element, text):
                 text_elements.append({
         # Fix spacing issues
         html_text = re.sub(r'>\s+<', '><', html_text)  # Remove extra spaces between tags
         html_text = re.sub(r'\s+>', '>', html_text)  # Remove spaces before closing >
+        html_text = re.sub(r'<\s+', '<', html_text)  # Remove spaces after opening
         # Fix common word errors that might occur during processing
         html_text = html_text.replace('down loaded', 'downloaded')
         preserved_scripts = []
         preserved_styles = []
+        # Parse HTML
+        soup = BeautifulSoup(html_content, 'html.parser')
+        # Find and preserve all script tags WITH their content
+        for idx, script in enumerate(soup.find_all('script')):
+            placeholder = BeautifulSoup(f'<div>{script_placeholder.format(idx)}</div>', 'html.parser').div
             preserved_scripts.append(str(script))
             script.replace_with(placeholder)
         # Preserve all style tags
+        for idx, style in enumerate(soup.find_all('style')):
+            placeholder = BeautifulSoup(f'<div>{style_placeholder.format(idx)}</div>', 'html.parser').div
             preserved_styles.append(str(style))
             style.replace_with(placeholder)
         # Get the modified HTML
+        html_content = str(soup)
         try:
             # Extract text elements
             # Restore all script tags
             for idx, script_content in enumerate(preserved_scripts):
                 placeholder = script_placeholder.format(idx)
+                result = result.replace(f'<div>{placeholder}</div>', script_content)
             # Restore all style tags
             for idx, style_content in enumerate(preserved_styles):
                 placeholder = style_placeholder.format(idx)
+                result = result.replace(f'<div>{placeholder}</div>', style_content)
             # Post-process the entire HTML to fix bold/strong formatting
             result = self.post_process_html(result)