Wajahat698 committed on
Commit
24398ed
·
verified ·
1 Parent(s): 8a84ab8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -33
app.py CHANGED
@@ -160,14 +160,14 @@ def clean_text(text):
160
  # Replace newline escape sequences with actual newlines
161
  text = text.replace('\\n', '\n')
162
 
163
- # Use BeautifulSoup to parse and remove all HTML tags, including problematic spans
164
  soup = BeautifulSoup(text, "html.parser")
165
-
166
- # Remove problematic span tags by identifying classes (e.g., 'mord', 'mathnormal')
167
  for span in soup.find_all('span', {'class': ['mord', 'mathnormal']}):
168
- span.decompose() # Completely remove the span element
169
 
170
- # Get cleaned text from BeautifulSoup object (removing any remaining HTML tags)
171
  cleaned_text = soup.get_text()
172
 
173
  # Remove unwanted asterisks and special characters
@@ -178,46 +178,28 @@ def clean_text(text):
178
  cleaned_text = re.sub(r'(\d)\s*([a-zA-Z])', r'\1 \2', cleaned_text) # Fix numbers next to letters
179
  cleaned_text = re.sub(r'(\d+)\s+([a-zA-Z])', r'\1 \2', cleaned_text) # Fix broken numbers and words
180
 
181
- # Remove any leftover HTML or math fragments from problematic tags
182
- cleaned_text = re.sub(r'<span[^>]*class="(mathnormal|mord)"[^>]*>.*?</span>', '', cleaned_text, flags=re.DOTALL)
183
-
184
- # Split text into paragraphs based on double newlines
185
  paragraphs = cleaned_text.split('\n\n')
186
-
187
- cleaned_paragraphs = []
188
 
 
189
  for paragraph in paragraphs:
190
  lines = paragraph.split('\n')
191
  cleaned_lines = []
192
-
193
  for line in lines:
194
- # Preserve headings if surrounded by double asterisks (indicating bold text)
195
- if line.strip().startswith('**') and line.strip().endswith('**'):
196
- cleaned_line = line.strip()
197
- else:
198
- # Remove unwanted asterisks and special characters
199
- cleaned_line = re.sub(r'[\*−∗]', '', line)
200
-
201
- # Separate merged words (e.g., "HelloWorld" -> "Hello World")
202
- cleaned_line = re.sub(r'([a-z])([A-Z])', r'\1 \2', cleaned_line)
203
-
204
- # Handle bullet points
205
- if cleaned_line.strip().startswith('-'):
206
- cleaned_line = '\n' + cleaned_line.strip()
207
-
208
- # Remove extra spaces
209
- cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()
210
-
211
  cleaned_lines.append(cleaned_line)
212
 
213
- # Join the cleaned lines within each paragraph
214
  cleaned_paragraph = '\n'.join(cleaned_lines)
215
  cleaned_paragraphs.append(cleaned_paragraph)
216
 
217
- # Join cleaned paragraphs back with double newlines
218
  final_cleaned_text = '\n\n'.join(para for para in cleaned_paragraphs if para)
219
-
220
  return final_cleaned_text
 
221
 
222
 
223
  def get_trust_tip_and_suggestion():
@@ -670,5 +652,5 @@ if prompt :
670
 
671
  # Add AI response to chat history
672
 
673
- st.session_state.chat_history.append({"role": "assistant", "content": combined_text})
674
  copy_to_clipboard(combined_text)
 
160
  # Replace newline escape sequences with actual newlines
161
  text = text.replace('\\n', '\n')
162
 
163
+ # Use BeautifulSoup to parse the HTML and remove span tags
164
  soup = BeautifulSoup(text, "html.parser")
165
+
166
+ # Remove all span tags with problematic classes
167
  for span in soup.find_all('span', {'class': ['mord', 'mathnormal']}):
168
+ span.decompose() # Remove span entirely
169
 
170
+ # Get cleaned text without any HTML tags
171
  cleaned_text = soup.get_text()
172
 
173
  # Remove unwanted asterisks and special characters
 
178
  cleaned_text = re.sub(r'(\d)\s*([a-zA-Z])', r'\1 \2', cleaned_text) # Fix numbers next to letters
179
  cleaned_text = re.sub(r'(\d+)\s+([a-zA-Z])', r'\1 \2', cleaned_text) # Fix broken numbers and words
180
 
181
+ # Split text into paragraphs based on double newlines for readability
 
 
 
182
  paragraphs = cleaned_text.split('\n\n')
 
 
183
 
184
+ cleaned_paragraphs = []
185
  for paragraph in paragraphs:
186
  lines = paragraph.split('\n')
187
  cleaned_lines = []
 
188
  for line in lines:
189
+ # Separate merged words (e.g., "HelloWorld" -> "Hello World")
190
+ cleaned_line = re.sub(r'([a-z])([A-Z])', r'\1 \2', line)
191
+ cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip() # Remove extra spaces
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  cleaned_lines.append(cleaned_line)
193
 
194
+ # Join cleaned lines into paragraphs
195
  cleaned_paragraph = '\n'.join(cleaned_lines)
196
  cleaned_paragraphs.append(cleaned_paragraph)
197
 
198
+ # Join cleaned paragraphs back into the final cleaned text
199
  final_cleaned_text = '\n\n'.join(para for para in cleaned_paragraphs if para)
200
+
201
  return final_cleaned_text
202
+
203
 
204
 
205
  def get_trust_tip_and_suggestion():
 
652
 
653
  # Add AI response to chat history
654
 
655
+ st.session_state.chat_history.append({"role": "assistant", "content": cleaned_text})
656
  copy_to_clipboard(combined_text)