Spaces:
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -160,14 +160,14 @@ def clean_text(text):
|
|
| 160 |
# Replace newline escape sequences with actual newlines
|
| 161 |
text = text.replace('\\n', '\n')
|
| 162 |
|
| 163 |
-
# Use BeautifulSoup to parse
|
| 164 |
soup = BeautifulSoup(text, "html.parser")
|
| 165 |
-
|
| 166 |
-
# Remove
|
| 167 |
for span in soup.find_all('span', {'class': ['mord', 'mathnormal']}):
|
| 168 |
-
span.decompose() #
|
| 169 |
|
| 170 |
-
# Get cleaned text
|
| 171 |
cleaned_text = soup.get_text()
|
| 172 |
|
| 173 |
# Remove unwanted asterisks and special characters
|
|
@@ -178,46 +178,28 @@ def clean_text(text):
|
|
| 178 |
cleaned_text = re.sub(r'(\d)\s*([a-zA-Z])', r'\1 \2', cleaned_text) # Fix numbers next to letters
|
| 179 |
cleaned_text = re.sub(r'(\d+)\s+([a-zA-Z])', r'\1 \2', cleaned_text) # Fix broken numbers and words
|
| 180 |
|
| 181 |
-
#
|
| 182 |
-
cleaned_text = re.sub(r'<span[^>]*class="(mathnormal|mord)"[^>]*>.*?</span>', '', cleaned_text, flags=re.DOTALL)
|
| 183 |
-
|
| 184 |
-
# Split text into paragraphs based on double newlines
|
| 185 |
paragraphs = cleaned_text.split('\n\n')
|
| 186 |
-
|
| 187 |
-
cleaned_paragraphs = []
|
| 188 |
|
|
|
|
| 189 |
for paragraph in paragraphs:
|
| 190 |
lines = paragraph.split('\n')
|
| 191 |
cleaned_lines = []
|
| 192 |
-
|
| 193 |
for line in lines:
|
| 194 |
-
#
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
else:
|
| 198 |
-
# Remove unwanted asterisks and special characters
|
| 199 |
-
cleaned_line = re.sub(r'[\*−∗]', '', line)
|
| 200 |
-
|
| 201 |
-
# Separate merged words (e.g., "HelloWorld" -> "Hello World")
|
| 202 |
-
cleaned_line = re.sub(r'([a-z])([A-Z])', r'\1 \2', cleaned_line)
|
| 203 |
-
|
| 204 |
-
# Handle bullet points
|
| 205 |
-
if cleaned_line.strip().startswith('-'):
|
| 206 |
-
cleaned_line = '\n' + cleaned_line.strip()
|
| 207 |
-
|
| 208 |
-
# Remove extra spaces
|
| 209 |
-
cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()
|
| 210 |
-
|
| 211 |
cleaned_lines.append(cleaned_line)
|
| 212 |
|
| 213 |
-
# Join
|
| 214 |
cleaned_paragraph = '\n'.join(cleaned_lines)
|
| 215 |
cleaned_paragraphs.append(cleaned_paragraph)
|
| 216 |
|
| 217 |
-
# Join cleaned paragraphs back
|
| 218 |
final_cleaned_text = '\n\n'.join(para for para in cleaned_paragraphs if para)
|
| 219 |
-
|
| 220 |
return final_cleaned_text
|
|
|
|
| 221 |
|
| 222 |
|
| 223 |
def get_trust_tip_and_suggestion():
|
|
@@ -670,5 +652,5 @@ if prompt :
|
|
| 670 |
|
| 671 |
# Add AI response to chat history
|
| 672 |
|
| 673 |
-
st.session_state.chat_history.append({"role": "assistant", "content":
|
| 674 |
copy_to_clipboard(combined_text)
|
|
|
|
| 160 |
# Replace newline escape sequences with actual newlines
|
| 161 |
text = text.replace('\\n', '\n')
|
| 162 |
|
| 163 |
+
# Use BeautifulSoup to parse the HTML and remove span tags
|
| 164 |
soup = BeautifulSoup(text, "html.parser")
|
| 165 |
+
|
| 166 |
+
# Remove all span tags with problematic classes
|
| 167 |
for span in soup.find_all('span', {'class': ['mord', 'mathnormal']}):
|
| 168 |
+
span.decompose() # Remove span entirely
|
| 169 |
|
| 170 |
+
# Get cleaned text without any HTML tags
|
| 171 |
cleaned_text = soup.get_text()
|
| 172 |
|
| 173 |
# Remove unwanted asterisks and special characters
|
|
|
|
| 178 |
cleaned_text = re.sub(r'(\d)\s*([a-zA-Z])', r'\1 \2', cleaned_text) # Fix numbers next to letters
|
| 179 |
cleaned_text = re.sub(r'(\d+)\s+([a-zA-Z])', r'\1 \2', cleaned_text) # Fix broken numbers and words
|
| 180 |
|
| 181 |
+
# Split text into paragraphs based on double newlines for readability
|
|
|
|
|
|
|
|
|
|
| 182 |
paragraphs = cleaned_text.split('\n\n')
|
|
|
|
|
|
|
| 183 |
|
| 184 |
+
cleaned_paragraphs = []
|
| 185 |
for paragraph in paragraphs:
|
| 186 |
lines = paragraph.split('\n')
|
| 187 |
cleaned_lines = []
|
|
|
|
| 188 |
for line in lines:
|
| 189 |
+
# Separate merged words (e.g., "HelloWorld" -> "Hello World")
|
| 190 |
+
cleaned_line = re.sub(r'([a-z])([A-Z])', r'\1 \2', line)
|
| 191 |
+
cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip() # Remove extra spaces
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
cleaned_lines.append(cleaned_line)
|
| 193 |
|
| 194 |
+
# Join cleaned lines into paragraphs
|
| 195 |
cleaned_paragraph = '\n'.join(cleaned_lines)
|
| 196 |
cleaned_paragraphs.append(cleaned_paragraph)
|
| 197 |
|
| 198 |
+
# Join cleaned paragraphs back into the final cleaned text
|
| 199 |
final_cleaned_text = '\n\n'.join(para for para in cleaned_paragraphs if para)
|
| 200 |
+
|
| 201 |
return final_cleaned_text
|
| 202 |
+
|
| 203 |
|
| 204 |
|
| 205 |
def get_trust_tip_and_suggestion():
|
|
|
|
| 652 |
|
| 653 |
# Add AI response to chat history
|
| 654 |
|
| 655 |
+
st.session_state.chat_history.append({"role": "assistant", "content": cleaned_text})
|
| 656 |
copy_to_clipboard(combined_text)
|