Spaces:
Sleeping
Sleeping
File size: 727 Bytes
5d4981c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 |
import re
# (entity, replacement) pairs applied in order. '&amp;' is deliberately last so
# that a double-escaped sequence such as '&amp;quot;' decodes one level only
# (to '&quot;') instead of collapsing all the way to '"'.
_ENTITY_REPLACEMENTS = (
    ('&#34;', '"'),   # numeric entity for "
    ('&quot;', '"'),  # named entity for "
    ('&apos;', "'"),  # named entity for '
    ('&#39;', "'"),   # numeric entity for '
    ('&amp;', '&'),   # named entity for &
)

# Matches <br>, <br/>, <br /> in any letter case.
_BR_TAG_RE = re.compile(r'<br\s*/?>', re.IGNORECASE)

# Matches any remaining tag-like span, e.g. <p>, </div>, <span class="x">.
_ANY_TAG_RE = re.compile(r'<[^>]+>')


def remove_html_tags(text: str) -> str:
    """Strip HTML markup from *text* and return the result lower-cased.

    Processing steps, in order:

    1. Decode a small fixed set of HTML entities for the double quote,
       the apostrophe and the ampersand.
    2. Replace ``<br>`` line-break tags (any spelling/case) with a single
       space so the words on either side stay separated.
    3. Delete every remaining ``<...>`` tag.
    4. Lower-case the cleaned text.

    Args:
        text: Raw string that may contain HTML tags and entities.

    Returns:
        The lower-cased text with tags removed and the handled entities
        decoded. Entities outside the handled set are left untouched.
    """
    for entity, char in _ENTITY_REPLACEMENTS:
        text = text.replace(entity, char)

    # Line breaks become spaces rather than being deleted outright, otherwise
    # "foo<br>bar" would turn into "foobar".
    text = _BR_TAG_RE.sub(' ', text)

    clean_text = _ANY_TAG_RE.sub('', text)
    return clean_text.lower()