Spaces:
Sleeping
Sleeping
| import time | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from typing import List | |
class URLParser:
    """Download web pages and extract their text fragments.

    Each page is fetched with ``requests`` and parsed with BeautifulSoup.
    The text of heading, link, span and paragraph tags is collected,
    lowercased and de-duplicated while keeping first-seen order.
    """

    # Tags whose text content is collected from every page.
    _TEXT_TAGS = ['h1', 'h2', 'h3', 'h4', 'a', 'span', 'p']

    def __init__(self):
        """Prepare the HTTP headers sent with every request."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def parse(self, urls: List[str]) -> List[List[str]]:
        """Parse several pages.

        Args:
            urls: Addresses of the pages to fetch.

        Returns:
            One list of text fragments per URL, in the same order as
            ``urls``. A page that could not be fetched contributes an
            empty list.
        """
        return [self.parse_one(url) for url in urls]

    def parse_one(self, url: str) -> List[str]:
        """Fetch a single page and return its unique, lowercased text fragments.

        Progress is reported on stdout. Any failure while requesting or
        parsing the page is treated as "page unavailable".

        Args:
            url: Address of the page to fetch.

        Returns:
            Lowercased, non-empty text fragments in first-seen order without
            duplicates, or an empty list when the page could not be loaded.
        """
        print("Request to site;", end=" ")
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            print("Get html;", end=" ")
            soup = BeautifulSoup(response.text, 'html.parser')
        except Exception:
            # Best-effort scraping: an unreachable or broken page must not
            # abort the processing of the remaining URLs.
            print("URL is not available, error")
            return []

        print("Parse html;")
        tags = soup.find_all(self._TEXT_TAGS)
        text = [tag.get_text(strip=True, separator=" ") for tag in tags]
        return self._unique_lowercase(text)

    @staticmethod
    def _unique_lowercase(lines: List[str]) -> List[str]:
        """Lowercase ``lines``, dropping empty strings and repeated entries."""
        seen = set()
        unique = []
        for line in lines:
            lower = line.lower()
            # Compare the lowercased form: it is what gets stored, so testing
            # the original-case text would let duplicates through.
            if lower and lower not in seen:
                seen.add(lower)
                unique.append(lower)
        return unique

    def __del__(self):
        """Release a browser driver if one was attached to this instance."""
        # This class never creates ``driver`` itself, so it may be missing;
        # an unguarded access would raise AttributeError during finalization.
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()