CopyVersion-FH

Paused

App Files Files Community

Wajahat698 committed on Sep 8, 2024

Commit

24398ed

verified ·

1 Parent(s): 8a84ab8

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -33

app.py CHANGED Viewed

@@ -160,14 +160,14 @@ def clean_text(text):
     # Replace newline escape sequences with actual newlines
     text = text.replace('\\n', '\n')
-    # Use BeautifulSoup to parse and remove all HTML tags, including problematic spans
     soup = BeautifulSoup(text, "html.parser")
-    # Remove problematic span tags by identifying classes (e.g., 'mord', 'mathnormal')
     for span in soup.find_all('span', {'class': ['mord', 'mathnormal']}):
-        span.decompose()  # Completely remove the span element
-    # Get cleaned text from BeautifulSoup object (removing any remaining HTML tags)
     cleaned_text = soup.get_text()
     # Remove unwanted asterisks and special characters
@@ -178,46 +178,28 @@ def clean_text(text):
     cleaned_text = re.sub(r'(\d)\s*([a-zA-Z])', r'\1 \2', cleaned_text)  # Fix numbers next to letters
     cleaned_text = re.sub(r'(\d+)\s+([a-zA-Z])', r'\1 \2', cleaned_text)  # Fix broken numbers and words
-    # Remove any leftover HTML or math fragments from problematic tags
-    cleaned_text = re.sub(r'<span[^>]*class="(mathnormal|mord)"[^>]*>.*?</span>', '', cleaned_text, flags=re.DOTALL)
-    # Split text into paragraphs based on double newlines
     paragraphs = cleaned_text.split('\n\n')
-    cleaned_paragraphs = []
     for paragraph in paragraphs:
         lines = paragraph.split('\n')
         cleaned_lines = []
         for line in lines:
-            # Preserve headings if surrounded by double asterisks (indicating bold text)
-            if line.strip().startswith('**') and line.strip().endswith('**'):
-                cleaned_line = line.strip()
-            else:
-                # Remove unwanted asterisks and special characters
-                cleaned_line = re.sub(r'[\*−∗]', '', line)
-                # Separate merged words (e.g., "HelloWorld" -> "Hello World")
-                cleaned_line = re.sub(r'([a-z])([A-Z])', r'\1 \2', cleaned_line)
-            # Handle bullet points
-            if cleaned_line.strip().startswith('-'):
-                cleaned_line = '\n' + cleaned_line.strip()
-            # Remove extra spaces
-            cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()
             cleaned_lines.append(cleaned_line)
-        # Join the cleaned lines within each paragraph
         cleaned_paragraph = '\n'.join(cleaned_lines)
         cleaned_paragraphs.append(cleaned_paragraph)
-    # Join cleaned paragraphs back with double newlines
     final_cleaned_text = '\n\n'.join(para for para in cleaned_paragraphs if para)
     return final_cleaned_text
 def get_trust_tip_and_suggestion():
@@ -670,5 +652,5 @@ if prompt :
         # Add AI response to chat history
-        st.session_state.chat_history.append({"role": "assistant", "content": combined_text})
         copy_to_clipboard(combined_text)

     # Replace newline escape sequences with actual newlines
     text = text.replace('\\n', '\n')
+    # Use BeautifulSoup to parse the HTML and remove span tags
     soup = BeautifulSoup(text, "html.parser")
+    # Remove all span tags with problematic classes
     for span in soup.find_all('span', {'class': ['mord', 'mathnormal']}):
+        span.decompose()  # Remove span entirely
+    # Get cleaned text without any HTML tags
     cleaned_text = soup.get_text()
     # Remove unwanted asterisks and special characters
     cleaned_text = re.sub(r'(\d)\s*([a-zA-Z])', r'\1 \2', cleaned_text)  # Fix numbers next to letters
     cleaned_text = re.sub(r'(\d+)\s+([a-zA-Z])', r'\1 \2', cleaned_text)  # Fix broken numbers and words
+    # Split text into paragraphs based on double newlines for readability
     paragraphs = cleaned_text.split('\n\n')
+    cleaned_paragraphs = []
     for paragraph in paragraphs:
         lines = paragraph.split('\n')
         cleaned_lines = []
         for line in lines:
+            # Separate merged words (e.g., "HelloWorld" -> "Hello World")
+            cleaned_line = re.sub(r'([a-z])([A-Z])', r'\1 \2', line)
+            cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()  # Remove extra spaces
             cleaned_lines.append(cleaned_line)
+        # Join cleaned lines into paragraphs
         cleaned_paragraph = '\n'.join(cleaned_lines)
         cleaned_paragraphs.append(cleaned_paragraph)
+    # Join cleaned paragraphs back into the final cleaned text
     final_cleaned_text = '\n\n'.join(para for para in cleaned_paragraphs if para)
     return final_cleaned_text
 def get_trust_tip_and_suggestion():
         # Add AI response to chat history
+        st.session_state.chat_history.append({"role": "assistant", "content": cleaned_text})
         copy_to_clipboard(combined_text)