Wajahat698 committed on
Commit
6d91132
·
verified ·
1 Parent(s): 5be33ea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -11
app.py CHANGED
@@ -149,16 +149,20 @@ def clean_text(text):
149
  # Replace escaped newlines with actual newlines
150
  text = text.replace('\\n', '\n')
151
 
152
- # Remove any span and italic tags
153
- text = re.sub(r'<span[^>]*>', '', text)
154
- text = re.sub(r'</span>', '', text)
155
- text = re.sub(r'<i[^>]*>', '', text)
156
- text = re.sub(r'</i>', '', text)
157
-
158
-
 
 
 
 
159
 
160
- # Preserve and correctly format markdown links (don't modify URLs)
161
- #text = re.sub(r'\[([^\]]+)\]\((https?://[^\)]+)\)', r'\1: \2', text)
162
 
163
  # Split the text into paragraphs
164
  paragraphs = text.split('\n\n')
@@ -168,7 +172,7 @@ def clean_text(text):
168
  lines = paragraph.split('\n')
169
  cleaned_lines = []
170
  for line in lines:
171
- # Preserve bold formatting for headings
172
  if line.strip().startswith('**') and line.strip().endswith('**'):
173
  cleaned_line = line.strip()
174
  else:
@@ -176,7 +180,7 @@ def clean_text(text):
176
  cleaned_line = re.sub(r'\*|\−|\∗', '', line)
177
  cleaned_line = re.sub(r'([a-z])([A-Z])', r'\1 \2', cleaned_line)
178
 
179
- # Handle bullet points
180
  if cleaned_line.strip().startswith('-'):
181
  cleaned_line = '\n' + cleaned_line.strip()
182
 
 
149
  # Replace escaped newlines with actual newlines
150
  text = text.replace('\\n', '\n')
151
 
152
+ # Convert <a> tags to Markdown links
153
+ def convert_links(match):
154
+ url = match.group(1)
155
+ link_text = match.group(2)
156
+ return f"[{link_text}]({url})"
157
+
158
+ # Handle <a> tags to preserve clickable URLs in Markdown format
159
+ text = re.sub(r'<a [^>]*href="([^"]+)"[^>]*>(.*?)</a>', convert_links, text)
160
+
161
+ # Remove <span>, <i>, <b>, and other unwanted HTML tags
162
+ text = re.sub(r'<span[^>]*>|</span>|<i[^>]*>|</i>|<b[^>]*>|</b>', '', text)
163
 
164
+ # Remove any remaining HTML tags
165
+ text = re.sub(r'<[^>]+>', '', text)
166
 
167
  # Split the text into paragraphs
168
  paragraphs = text.split('\n\n')
 
172
  lines = paragraph.split('\n')
173
  cleaned_lines = []
174
  for line in lines:
175
+ # Preserve and correctly format headings or bold text
176
  if line.strip().startswith('**') and line.strip().endswith('**'):
177
  cleaned_line = line.strip()
178
  else:
 
180
  cleaned_line = re.sub(r'\*|\−|\∗', '', line)
181
  cleaned_line = re.sub(r'([a-z])([A-Z])', r'\1 \2', cleaned_line)
182
 
183
+ # Handle bullet points correctly
184
  if cleaned_line.strip().startswith('-'):
185
  cleaned_line = '\n' + cleaned_line.strip()
186