Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -159,31 +159,37 @@ def send_feedback_via_email(name, email, feedback):
|
|
| 159 |
def clean_text(text):
|
| 160 |
# Replace newline escape sequences with actual newlines
|
| 161 |
text = text.replace('\\n', '\n')
|
| 162 |
-
|
| 163 |
-
# Use BeautifulSoup to parse and remove all HTML tags
|
| 164 |
soup = BeautifulSoup(text, "html.parser")
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
# Remove unwanted asterisks and special characters
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
# Fix numbers adjacent to letters and units
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
-
# Remove any leftover HTML or math fragments
|
| 176 |
-
text = re.sub(r'<span[^>]*class="(mathnormal|mord)"[^>]*>.*?</span>', '', text, flags=re.DOTALL)
|
| 177 |
-
|
| 178 |
# Split text into paragraphs based on double newlines
|
| 179 |
-
paragraphs =
|
| 180 |
|
| 181 |
cleaned_paragraphs = []
|
| 182 |
-
|
| 183 |
for paragraph in paragraphs:
|
| 184 |
lines = paragraph.split('\n')
|
| 185 |
cleaned_lines = []
|
| 186 |
-
|
| 187 |
for line in lines:
|
| 188 |
# Preserve headings if surrounded by double asterisks (indicating bold text)
|
| 189 |
if line.strip().startswith('**') and line.strip().endswith('**'):
|
|
@@ -201,17 +207,18 @@ def clean_text(text):
|
|
| 201 |
|
| 202 |
# Remove extra spaces
|
| 203 |
cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()
|
| 204 |
-
|
| 205 |
cleaned_lines.append(cleaned_line)
|
| 206 |
-
|
| 207 |
# Join the cleaned lines within each paragraph
|
| 208 |
cleaned_paragraph = '\n'.join(cleaned_lines)
|
| 209 |
cleaned_paragraphs.append(cleaned_paragraph)
|
| 210 |
-
|
| 211 |
# Join cleaned paragraphs back with double newlines
|
| 212 |
-
|
| 213 |
|
| 214 |
-
return
|
|
|
|
| 215 |
|
| 216 |
def get_trust_tip_and_suggestion():
|
| 217 |
trust_tip = random.choice(trust_tips)
|
|
|
|
| 159 |
def clean_text(text):
|
| 160 |
# Replace newline escape sequences with actual newlines
|
| 161 |
text = text.replace('\\n', '\n')
|
| 162 |
+
|
| 163 |
+
# Use BeautifulSoup to parse and remove all HTML tags, including problematic spans
|
| 164 |
soup = BeautifulSoup(text, "html.parser")
|
| 165 |
+
|
| 166 |
+
# Remove problematic span tags by identifying classes (e.g., 'mord', 'mathnormal')
|
| 167 |
+
for span in soup.find_all('span', {'class': ['mord', 'mathnormal']}):
|
| 168 |
+
span.decompose() # Completely remove the span element
|
| 169 |
+
|
| 170 |
+
# Get cleaned text from BeautifulSoup object (removing any remaining HTML tags)
|
| 171 |
+
cleaned_text = soup.get_text()
|
| 172 |
|
| 173 |
# Remove unwanted asterisks and special characters
|
| 174 |
+
cleaned_text = re.sub(r'[\*−∗]', '', cleaned_text)
|
| 175 |
+
|
| 176 |
+
# Fix numbers adjacent to letters and units (e.g., 10B -> 10 B)
|
| 177 |
+
cleaned_text = re.sub(r'(\d+)\s*(B|M|T|billion|million|trillion)', lambda m: f"{m.group(1)} {m.group(2)}", cleaned_text)
|
| 178 |
+
cleaned_text = re.sub(r'(\d)\s*([a-zA-Z])', r'\1 \2', cleaned_text) # Fix numbers next to letters
|
| 179 |
+
cleaned_text = re.sub(r'(\d+)\s+([a-zA-Z])', r'\1 \2', cleaned_text) # Fix broken numbers and words
|
| 180 |
+
|
| 181 |
+
# Remove any leftover HTML or math fragments from problematic tags
|
| 182 |
+
cleaned_text = re.sub(r'<span[^>]*class="(mathnormal|mord)"[^>]*>.*?</span>', '', cleaned_text, flags=re.DOTALL)
|
| 183 |
|
|
|
|
|
|
|
|
|
|
| 184 |
# Split text into paragraphs based on double newlines
|
| 185 |
+
paragraphs = cleaned_text.split('\n\n')
|
| 186 |
|
| 187 |
cleaned_paragraphs = []
|
| 188 |
+
|
| 189 |
for paragraph in paragraphs:
|
| 190 |
lines = paragraph.split('\n')
|
| 191 |
cleaned_lines = []
|
| 192 |
+
|
| 193 |
for line in lines:
|
| 194 |
# Preserve headings if surrounded by double asterisks (indicating bold text)
|
| 195 |
if line.strip().startswith('**') and line.strip().endswith('**'):
|
|
|
|
| 207 |
|
| 208 |
# Remove extra spaces
|
| 209 |
cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()
|
| 210 |
+
|
| 211 |
cleaned_lines.append(cleaned_line)
|
| 212 |
+
|
| 213 |
# Join the cleaned lines within each paragraph
|
| 214 |
cleaned_paragraph = '\n'.join(cleaned_lines)
|
| 215 |
cleaned_paragraphs.append(cleaned_paragraph)
|
| 216 |
+
|
| 217 |
# Join cleaned paragraphs back with double newlines
|
| 218 |
+
final_cleaned_text = '\n\n'.join(para for para in cleaned_paragraphs if para)
|
| 219 |
|
| 220 |
+
return final_cleaned_text
|
| 221 |
+
|
| 222 |
|
| 223 |
def get_trust_tip_and_suggestion():
|
| 224 |
trust_tip = random.choice(trust_tips)
|