Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -149,16 +149,20 @@ def clean_text(text):
|
|
| 149 |
# Replace escaped newlines with actual newlines
|
| 150 |
text = text.replace('\\n', '\n')
|
| 151 |
|
| 152 |
-
#
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
|
| 160 |
-
#
|
| 161 |
-
|
| 162 |
|
| 163 |
# Split the text into paragraphs
|
| 164 |
paragraphs = text.split('\n\n')
|
|
@@ -168,7 +172,7 @@ def clean_text(text):
|
|
| 168 |
lines = paragraph.split('\n')
|
| 169 |
cleaned_lines = []
|
| 170 |
for line in lines:
|
| 171 |
-
# Preserve
|
| 172 |
if line.strip().startswith('**') and line.strip().endswith('**'):
|
| 173 |
cleaned_line = line.strip()
|
| 174 |
else:
|
|
@@ -176,7 +180,7 @@ def clean_text(text):
|
|
| 176 |
cleaned_line = re.sub(r'\*|\−|\∗', '', line)
|
| 177 |
cleaned_line = re.sub(r'([a-z])([A-Z])', r'\1 \2', cleaned_line)
|
| 178 |
|
| 179 |
-
# Handle bullet points
|
| 180 |
if cleaned_line.strip().startswith('-'):
|
| 181 |
cleaned_line = '\n' + cleaned_line.strip()
|
| 182 |
|
|
|
|
| 149 |
# Replace escaped newlines with actual newlines
|
| 150 |
text = text.replace('\\n', '\n')
|
| 151 |
|
| 152 |
+
# Convert <a> tags to Markdown links
|
| 153 |
+
def convert_links(match):
|
| 154 |
+
url = match.group(1)
|
| 155 |
+
link_text = match.group(2)
|
| 156 |
+
return f"[{link_text}]({url})"
|
| 157 |
+
|
| 158 |
+
# Rewrite <a> tags as Markdown links so URLs stay clickable (matches double-quoted href only; link text cannot span a newline)
|
| 159 |
+
text = re.sub(r'<a [^>]*href="([^"]+)"[^>]*>(.*?)</a>', convert_links, text)
|
| 160 |
+
|
| 161 |
+
# Remove <span>, <i>, and <b> tags (the <i...> and <b...> patterns also match other opening tags beginning with "i" or "b", e.g. <img> or <br>)
|
| 162 |
+
text = re.sub(r'<span[^>]*>|</span>|<i[^>]*>|</i>|<b[^>]*>|</b>', '', text)
|
| 163 |
|
| 164 |
+
# Remove any remaining HTML tags
|
| 165 |
+
text = re.sub(r'<[^>]+>', '', text)
|
| 166 |
|
| 167 |
# Split the text into paragraphs
|
| 168 |
paragraphs = text.split('\n\n')
|
|
|
|
| 172 |
lines = paragraph.split('\n')
|
| 173 |
cleaned_lines = []
|
| 174 |
for line in lines:
|
| 175 |
+
# Preserve and correctly format headings or bold text
|
| 176 |
if line.strip().startswith('**') and line.strip().endswith('**'):
|
| 177 |
cleaned_line = line.strip()
|
| 178 |
else:
|
|
|
|
| 180 |
cleaned_line = re.sub(r'\*|\−|\∗', '', line)
|
| 181 |
cleaned_line = re.sub(r'([a-z])([A-Z])', r'\1 \2', cleaned_line)
|
| 182 |
|
| 183 |
+
# Handle bullet points correctly
|
| 184 |
if cleaned_line.strip().startswith('-'):
|
| 185 |
cleaned_line = '\n' + cleaned_line.strip()
|
| 186 |
|