Wajahat698 committed on
Commit
19a21e9
·
verified ·
1 Parent(s): 01548c9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -20
app.py CHANGED
@@ -159,31 +159,37 @@ def send_feedback_via_email(name, email, feedback):
159
  def clean_text(text):
160
  # Replace newline escape sequences with actual newlines
161
  text = text.replace('\\n', '\n')
162
-
163
- # Use BeautifulSoup to parse and remove all HTML tags
164
  soup = BeautifulSoup(text, "html.parser")
165
- text = soup.get_text()
 
 
 
 
 
 
166
 
167
  # Remove unwanted asterisks and special characters
168
- text = re.sub(r'[\*−∗]', '', text)
169
-
170
- # Fix numbers adjacent to letters and units
171
- text = re.sub(r'(\d+)\s*(B|M|T|billion|million|trillion)', lambda m: f"{m.group(1)} {m.group(2)}", text)
172
- text = re.sub(r'(\d)\s*([a-zA-Z])', r'\1 \2', text) # Fix numbers next to letters
173
- text = re.sub(r'(\d+)\s+([a-zA-Z])', r'\1 \2', text) # Fix broken numbers and words
 
 
 
174
 
175
- # Remove any leftover HTML or math fragments
176
- text = re.sub(r'<span[^>]*class="(mathnormal|mord)"[^>]*>.*?</span>', '', text, flags=re.DOTALL)
177
-
178
  # Split text into paragraphs based on double newlines
179
- paragraphs = text.split('\n\n')
180
 
181
  cleaned_paragraphs = []
182
-
183
  for paragraph in paragraphs:
184
  lines = paragraph.split('\n')
185
  cleaned_lines = []
186
-
187
  for line in lines:
188
  # Preserve headings if surrounded by double asterisks (indicating bold text)
189
  if line.strip().startswith('**') and line.strip().endswith('**'):
@@ -201,17 +207,18 @@ def clean_text(text):
201
 
202
  # Remove extra spaces
203
  cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()
204
-
205
  cleaned_lines.append(cleaned_line)
206
-
207
  # Join the cleaned lines within each paragraph
208
  cleaned_paragraph = '\n'.join(cleaned_lines)
209
  cleaned_paragraphs.append(cleaned_paragraph)
210
-
211
  # Join cleaned paragraphs back with double newlines
212
- cleaned_text = '\n\n'.join(para for para in cleaned_paragraphs if para)
213
 
214
- return cleaned_text
 
215
 
216
  def get_trust_tip_and_suggestion():
217
  trust_tip = random.choice(trust_tips)
 
159
  def clean_text(text):
160
  # Replace newline escape sequences with actual newlines
161
  text = text.replace('\\n', '\n')
162
+
163
+ # Use BeautifulSoup to parse and remove all HTML tags, including problematic spans
164
  soup = BeautifulSoup(text, "html.parser")
165
+
166
+ # Remove problematic span tags by identifying classes (e.g., 'mord', 'mathnormal')
167
+ for span in soup.find_all('span', {'class': ['mord', 'mathnormal']}):
168
+ span.decompose() # Completely remove the span element
169
+
170
+ # Get cleaned text from BeautifulSoup object (removing any remaining HTML tags)
171
+ cleaned_text = soup.get_text()
172
 
173
  # Remove unwanted asterisks and special characters
174
+ cleaned_text = re.sub(r'[\*−∗]', '', cleaned_text)
175
+
176
+ # Fix numbers adjacent to letters and units (e.g., 10B -> 10 B)
177
+ cleaned_text = re.sub(r'(\d+)\s*(B|M|T|billion|million|trillion)', lambda m: f"{m.group(1)} {m.group(2)}", cleaned_text)
178
+ cleaned_text = re.sub(r'(\d)\s*([a-zA-Z])', r'\1 \2', cleaned_text) # Fix numbers next to letters
179
+ cleaned_text = re.sub(r'(\d+)\s+([a-zA-Z])', r'\1 \2', cleaned_text) # Fix broken numbers and words
180
+
181
+ # Remove any leftover HTML or math fragments from problematic tags
182
+ cleaned_text = re.sub(r'<span[^>]*class="(mathnormal|mord)"[^>]*>.*?</span>', '', cleaned_text, flags=re.DOTALL)
183
 
 
 
 
184
  # Split text into paragraphs based on double newlines
185
+ paragraphs = cleaned_text.split('\n\n')
186
 
187
  cleaned_paragraphs = []
188
+
189
  for paragraph in paragraphs:
190
  lines = paragraph.split('\n')
191
  cleaned_lines = []
192
+
193
  for line in lines:
194
  # Preserve headings if surrounded by double asterisks (indicating bold text)
195
  if line.strip().startswith('**') and line.strip().endswith('**'):
 
207
 
208
  # Remove extra spaces
209
  cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()
210
+
211
  cleaned_lines.append(cleaned_line)
212
+
213
  # Join the cleaned lines within each paragraph
214
  cleaned_paragraph = '\n'.join(cleaned_lines)
215
  cleaned_paragraphs.append(cleaned_paragraph)
216
+
217
  # Join cleaned paragraphs back with double newlines
218
+ final_cleaned_text = '\n\n'.join(para for para in cleaned_paragraphs if para)
219
 
220
+ return final_cleaned_text
221
+
222
 
223
  def get_trust_tip_and_suggestion():
224
  trust_tip = random.choice(trust_tips)