Copy-AI

Build error

App Files Files Community

Wajahat698 committed on Sep 8, 2024

Commit

19a21e9

verified ·

1 Parent(s): 01548c9

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -20

app.py CHANGED Viewed

@@ -159,31 +159,37 @@ def send_feedback_via_email(name, email, feedback):
 def clean_text(text):
     # Replace newline escape sequences with actual newlines
     text = text.replace('\\n', '\n')
-    # Use BeautifulSoup to parse and remove all HTML tags
     soup = BeautifulSoup(text, "html.parser")
-    text = soup.get_text()
     # Remove unwanted asterisks and special characters
-    text = re.sub(r'[\*−∗]', '', text)
-    # Fix numbers adjacent to letters and units
-    text = re.sub(r'(\d+)\s*(B|M|T|billion|million|trillion)', lambda m: f"{m.group(1)} {m.group(2)}", text)
-    text = re.sub(r'(\d)\s*([a-zA-Z])', r'\1 \2', text)  # Fix numbers next to letters
-    text = re.sub(r'(\d+)\s+([a-zA-Z])', r'\1 \2', text)  # Fix broken numbers and words
-    # Remove any leftover HTML or math fragments
-    text = re.sub(r'<span[^>]*class="(mathnormal|mord)"[^>]*>.*?</span>', '', text, flags=re.DOTALL)
     # Split text into paragraphs based on double newlines
-    paragraphs = text.split('\n\n')
     cleaned_paragraphs = []
     for paragraph in paragraphs:
         lines = paragraph.split('\n')
         cleaned_lines = []
         for line in lines:
             # Preserve headings if surrounded by double asterisks (indicating bold text)
             if line.strip().startswith('**') and line.strip().endswith('**'):
@@ -201,17 +207,18 @@ def clean_text(text):
             # Remove extra spaces
             cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()
             cleaned_lines.append(cleaned_line)
         # Join the cleaned lines within each paragraph
         cleaned_paragraph = '\n'.join(cleaned_lines)
         cleaned_paragraphs.append(cleaned_paragraph)
     # Join cleaned paragraphs back with double newlines
-    cleaned_text = '\n\n'.join(para for para in cleaned_paragraphs if para)
-    return cleaned_text
 def get_trust_tip_and_suggestion():
     trust_tip = random.choice(trust_tips)

 def clean_text(text):
     # Replace newline escape sequences with actual newlines
     text = text.replace('\\n', '\n')
+    # Use BeautifulSoup to parse and remove all HTML tags, including problematic spans
     soup = BeautifulSoup(text, "html.parser")
+    # Remove problematic span tags by identifying classes (e.g., 'mord', 'mathnormal')
+    for span in soup.find_all('span', {'class': ['mord', 'mathnormal']}):
+        span.decompose()  # Completely remove the span element
+    # Get cleaned text from BeautifulSoup object (removing any remaining HTML tags)
+    cleaned_text = soup.get_text()
     # Remove unwanted asterisks and special characters
+    cleaned_text = re.sub(r'[\*−∗]', '', cleaned_text)
+    # Fix numbers adjacent to letters and units (e.g., 10B -> 10 B)
+    cleaned_text = re.sub(r'(\d+)\s*(B|M|T|billion|million|trillion)', lambda m: f"{m.group(1)} {m.group(2)}", cleaned_text)
+    cleaned_text = re.sub(r'(\d)\s*([a-zA-Z])', r'\1 \2', cleaned_text)  # Fix numbers next to letters
+    cleaned_text = re.sub(r'(\d+)\s+([a-zA-Z])', r'\1 \2', cleaned_text)  # Fix broken numbers and words
+    # Remove any leftover HTML or math fragments from problematic tags
+    cleaned_text = re.sub(r'<span[^>]*class="(mathnormal|mord)"[^>]*>.*?</span>', '', cleaned_text, flags=re.DOTALL)
     # Split text into paragraphs based on double newlines
+    paragraphs = cleaned_text.split('\n\n')
     cleaned_paragraphs = []
     for paragraph in paragraphs:
         lines = paragraph.split('\n')
         cleaned_lines = []
         for line in lines:
             # Preserve headings if surrounded by double asterisks (indicating bold text)
             if line.strip().startswith('**') and line.strip().endswith('**'):
             # Remove extra spaces
             cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()
             cleaned_lines.append(cleaned_line)
         # Join the cleaned lines within each paragraph
         cleaned_paragraph = '\n'.join(cleaned_lines)
         cleaned_paragraphs.append(cleaned_paragraph)
     # Join cleaned paragraphs back with double newlines
+    final_cleaned_text = '\n\n'.join(para for para in cleaned_paragraphs if para)
+    return final_cleaned_text
 def get_trust_tip_and_suggestion():
     trust_tip = random.choice(trust_tips)