ads505-app / utils /remove_html.py
Taylor Kirk
Fresh deployment after moving datasets to hf datahub
5d4981c
raw
history blame contribute delete
727 Bytes
import re
def remove_html_tags(text):
    """Return *text* with HTML markup removed, lowercased.

    Processing happens in three steps:

    1. A small, fixed set of HTML entities for the double quote,
       the apostrophe and the ampersand is decoded.
    2. ``<br />`` and ``<br>`` are turned into a single space so the
       words on either side of a line break do not run together.
    3. Every remaining ``<...>`` tag is deleted and the result is
       lowercased.

    Args:
        text: The string to clean. Must support ``str.replace``.

    Returns:
        The cleaned, lowercased string.
    """
    # NOTE(review): the entity literals below were reconstructed from the
    # accompanying comments (numeric + named form for each quote character);
    # confirm against the deployed version of this file.
    #
    # Order matters: '&amp;' is decoded last so that an escaped entity such
    # as '&amp;quot;' becomes the literal text '&quot;' rather than being
    # decoded a second time into '"'.
    entity_replacements = (
        ('&#34;', '"'),   # numeric entity for "
        ('&quot;', '"'),  # named entity for "
        ('&apos;', "'"),  # named entity for '
        ('&#39;', "'"),   # numeric entity for '
        ('&amp;', '&'),   # named entity for &
    )
    for entity, char in entity_replacements:
        text = text.replace(entity, char)

    # Line breaks separate words, so they become a space instead of vanishing.
    text = text.replace('<br />', ' ')
    text = text.replace('<br>', ' ')

    # Drop any remaining HTML tags (e.g., <p>, <div>, <span>).
    clean_text = re.sub(r'<[^>]+>', '', text)
    return clean_text.lower()