File size: 1,303 Bytes
469f119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9fcac76
469f119
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import time

import requests
from bs4 import BeautifulSoup
from typing import List


class URLParser:
    """Download web pages and extract their visible text fragments.

    Each page is fetched with ``requests`` using a browser-like
    ``User-Agent`` header, parsed with BeautifulSoup, and reduced to a list
    of unique, lower-cased text fragments taken from heading, link, span
    and paragraph tags.
    """

    def __init__(self):
        # HTTP headers sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def parse(self, urls: List[str]) -> List[List[str]]:
        """Parse every URL in *urls*.

        Args:
            urls: Page addresses to download.

        Returns:
            One list of text fragments per input URL, in the same order.
            A URL that could not be processed contributes an empty list.
        """
        return [self.parse_one(url) for url in urls]

    def parse_one(self, url: str) -> List[str]:
        """Download a single page and return its unique lower-cased texts.

        Progress messages are printed to stdout while the page is handled.

        Args:
            url: Address of the page to download.

        Returns:
            Unique, lower-cased, non-empty text fragments in document order,
            or an empty list when the page could not be fetched or parsed.
        """
        print("Request to site;", end=" ")
        try:
            response = requests.get(url, headers=self.headers, timeout=10)

            print("Get html;", end=" ")
            soup = BeautifulSoup(response.text, 'html.parser')

        except Exception:
            # Deliberately best-effort: one bad URL must not abort a whole
            # batch processed by parse(), so any failure yields no data.
            print("URL is not available, error")
            return []

        print("Parse html;")
        tags = soup.find_all(['h1', 'h2', 'h3', 'h4', 'a', 'span', 'p'])
        texts = [tag.get_text(strip=True, separator=" ") for tag in tags]
        return self._normalize(texts)

    @staticmethod
    def _normalize(lines: List[str]) -> List[str]:
        """Lower-case *lines*, dropping empty strings and duplicates.

        Duplicates are detected on the lower-cased form, so ``"Hello"`` and
        ``"hello"`` count as the same entry. The first occurrence wins and
        the original order is preserved.
        """
        seen = set()
        unique = []
        for line in lines:
            lowered = line.lower()
            # Compare the lower-cased value: the stored entries are
            # lower-cased, so testing the raw line would miss duplicates.
            if lowered and lowered not in seen:
                seen.add(lowered)
                unique.append(lowered)
        return unique

    def __del__(self):
        """Release the optional ``driver`` attribute, if one was attached.

        This class never creates a driver itself, so the attribute is looked
        up defensively to avoid an AttributeError during finalization.
        """
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()