Spaces:

vaveyko
/

Furniture-Shop-Parser

Sleeping

Upload 2 files

9fcac76 verified 3 months ago

1.3 kB

	import time

	import requests
	from bs4 import BeautifulSoup
	from typing import List


	class URLParser():
	    """Download web pages and extract their text fragments.

	    Each page is fetched with ``requests`` using a browser-like
	    ``User-Agent`` header, parsed with BeautifulSoup, and reduced to a
	    list of unique, lower-cased text fragments taken from heading,
	    link, span and paragraph tags.
	    """

	    def __init__(self):
	        """Set up the HTTP headers sent with every request."""
	        self.headers = {
	            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
	        }

	    def parse(self, urls: List[str]) -> List[List[str]]:
	        """Parse every URL in *urls*.

	        Args:
	            urls: Page addresses to download.

	        Returns:
	            One list of text fragments per URL, in input order. A URL
	            that could not be fetched or parsed yields an empty list.
	        """
	        return [self.parse_one(url) for url in urls]

	    def parse_one(self, url: str) -> List[str]:
	        """Download a single page and return its unique text fragments.

	        Progress is reported on stdout. Failures are deliberately not
	        propagated so that one bad URL does not abort a whole batch.

	        Args:
	            url: Page address to download.

	        Returns:
	            Lower-cased, de-duplicated text fragments in document
	            order, or an empty list when the page could not be
	            fetched or parsed.
	        """
	        print("Request to site;", end=" ")
	        try:
	            response = requests.get(url, headers=self.headers, timeout=10)

	            print("Get html;", end=" ")
	            soup = BeautifulSoup(response.text, 'html.parser')
	        except Exception as e:
	            # Best-effort boundary: report the reason and carry on with
	            # an empty result instead of crashing the caller.
	            print(f"URL is not available, error: {e}")
	            return []

	        print("Parse html;")
	        tags = soup.find_all(['h1', 'h2', 'h3', 'h4', 'a', 'span', 'p'])
	        text = [tag.get_text(strip=True, separator=" ") for tag in tags]
	        return self._preprocess(text)

	    @staticmethod
	    def _preprocess(lines: List[str]) -> List[str]:
	        """Lower-case *lines*, dropping empty and repeated entries.

	        Duplicates are detected on the lower-cased form, so ``"Sofa"``
	        and ``"sofa"`` count as the same entry. First-occurrence order
	        is preserved.
	        """
	        seen = set()
	        unique = []
	        for line in lines:
	            lower = line.lower()
	            if lower and lower not in seen:
	                seen.add(lower)
	                unique.append(lower)
	        return unique

	    def __del__(self):
	        """Quit the browser driver if one was ever attached.

	        This class never sets ``driver`` itself, so the attribute is
	        looked up defensively rather than assumed to exist.
	        """
	        driver = getattr(self, "driver", None)
	        if driver is not None:
	            driver.quit()