Spaces:
Runtime error
Runtime error
| import requests | |
| from selectolax.parser import HTMLParser | |
| import re | |
| from string import punctuation | |
# Deletion table for every character in string.punctuation, built once at
# import time so each call is a single C-level pass over the text.
_PUNCTUATION_TABLE = str.maketrans("", "", punctuation)


def preprocess_text(text):
    """Normalize *text* for downstream matching.

    The text is lowercased, every character in ``string.punctuation``
    (ASCII punctuation only) is deleted, and every run of whitespace
    (spaces, tabs, newlines) is collapsed to a single space with leading
    and trailing whitespace removed.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; ``""`` when nothing but punctuation and
        whitespace was supplied.
    """
    lowered = text.lower()
    # Punctuation is deleted, not replaced by a space, so "don't" -> "dont".
    stripped = lowered.translate(_PUNCTUATION_TABLE)
    # split() with no argument splits on any whitespace run and drops
    # empty pieces, so joining with one space collapses and trims at once.
    return " ".join(stripped.split())
def get_html(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    The HTTP status code is not checked: the body of an error page
    (for example a 404) is returned like any other response.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without it an unresponsive
            server would block the caller indefinitely.

    Returns:
        The response body decoded to ``str`` by ``requests``.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            when the timeout elapses.
    """
    response = requests.get(url, timeout=timeout)
    # .text is the body decoded with the encoding requests detected.
    return response.text
def get_text(html):
    """Extract the normalized visible text of an HTML document.

    ``<script>`` and ``<style>`` elements are removed from the parsed
    tree, the remaining text under ``<body>`` is concatenated with no
    separator between nodes, and the result is passed through
    ``preprocess_text``.

    Args:
        html: HTML markup as a string.

    Returns:
        The preprocessed body text, or ``None`` when the parsed document
        has no ``<body>``.
    """
    document = HTMLParser(html)
    if document.body is None:
        return None

    # Drop non-content elements so their source does not leak into the text.
    for selector in ('script', 'style'):
        for node in document.css(selector):
            node.decompose()

    raw_text = document.body.text(separator='')
    return preprocess_text(raw_text)
def get_html_text(url):
    """Download the page at *url* and return its extracted text.

    Chains ``get_html`` and ``get_text``.

    Args:
        url: Address of the page to download.

    Returns:
        Whatever ``get_text`` returns for the downloaded markup, which
        is ``None`` when the document has no ``<body>``.
    """
    return get_text(get_html(url))