import re

# (entity, replacement) pairs, applied in order.  '&amp;' must stay last:
# decoding it earlier would turn "&amp;quot;" into "&quot;" and then into '"',
# i.e. decode the same text twice.
_ENTITY_REPLACEMENTS = (
    ("&#34;", '"'),   # numeric entity for a double quote
    ("&quot;", '"'),  # named entity for a double quote
    ("&apos;", "'"),  # named entity for an apostrophe
    ("&#39;", "'"),   # numeric entity for an apostrophe
    ("&amp;", "&"),   # ampersand
)

# Matches <br>, <br/>, <br /> in any letter case.
_LINE_BREAK_TAG = re.compile(r"<br\s*/?>", re.IGNORECASE)

# Matches any remaining tag: '<', one or more non-'>' characters, then '>'.
_ANY_TAG = re.compile(r"<[^>]+>")


def remove_html_tags(text: str) -> str:
    """Return *text* with HTML tags removed, lower-cased.

    Processing steps, in order:

    1. A small fixed set of character entities (``&#34;``, ``&quot;``,
       ``&apos;``, ``&#39;``, ``&amp;``) is replaced by the character it
       stands for.  Other entities are left untouched.
    2. Line-break tags (``<br>``, ``<br/>``, ``<br />``) become a single
       space so the words on either side do not run together.
    3. Every remaining ``<...>`` tag is deleted.
    4. The result is converted to lower case.

    Args:
        text: The HTML fragment to clean.

    Returns:
        The cleaned, lower-cased text.
    """
    for entity, replacement in _ENTITY_REPLACEMENTS:
        text = text.replace(entity, replacement)

    # Line breaks are handled before the generic tag removal; otherwise they
    # would be deleted outright and "a<br>b" would collapse to "ab".
    text = _LINE_BREAK_TAG.sub(" ", text)

    clean_text = _ANY_TAG.sub("", text)
    return clean_text.lower()