# File size: 727 Bytes
# Revision: 5d4981c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import re

def remove_html_tags(text: str) -> str:
    """Return *text* lower-cased, with HTML tags removed and common entities decoded.

    Processing order:

    1. The quote, apostrophe and ampersand entities (``&#34;``, ``&quot;``,
       ``&apos;``, ``&#39;``, ``&amp;``) are replaced by the characters they
       stand for.
    2. ``<br>`` line breaks (any letter case, with or without a self-closing
       slash) are replaced by a single space so neighbouring words stay
       separated.
    3. Every remaining ``<...>`` tag is deleted.
    4. The result is lower-cased.

    Args:
        text: The HTML fragment to clean.

    Returns:
        The cleaned, lower-cased text. Entities other than the five listed
        above are left as they are.
    """
    # '&amp;' is deliberately decoded last: decoding it first would turn an
    # escaped entity such as '&amp;quot;' into '&quot;' and then into '"',
    # i.e. the text would be unescaped twice.
    entity_to_char = (
        ('&#34;', '"'),    # numeric entity for "
        ('&quot;', '"'),   # named entity for "
        ('&apos;', "'"),   # named entity for '
        ('&#39;', "'"),    # numeric entity for '
        ('&amp;', '&'),    # named entity for &
    )
    for entity, char in entity_to_char:
        text = text.replace(entity, char)

    # Line breaks become a space; without this the generic tag stripper below
    # would glue the words on either side of the break together.
    text = re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE)

    # Remove any remaining HTML tags (e.g. <p>, <div>, <span>).
    clean_text = re.sub(r'<[^>]+>', '', text)

    return clean_text.lower()