"""Web-scraping service: fetch a page, strip boilerplate HTML, and split the text into chunks."""
from typing import Tuple

from bs4 import BeautifulSoup
from httpx import AsyncClient
from langchain_text_splitters import RecursiveCharacterTextSplitter
class ScraperService:
    """Fetch a web page, strip boilerplate markup, and split the visible text into chunks."""

    # Browser-like headers so sites that reject default HTTP-client user agents still respond.
    _CHROME_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 20):
        """Create the service.

        Args:
            chunk_size: Maximum characters per chunk (default matches the
                original hard-coded value, so existing callers are unaffected).
            chunk_overlap: Characters of overlap between consecutive chunks.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )

    async def scrape_website(self, url: str) -> str:
        """Return the raw HTML body of ``url``.

        Fixes over the original:
        - ``follow_redirects=True``: httpx does NOT follow redirects by
          default, so any site answering with a 301/302 previously yielded
          an empty or useless body.
        - ``timeout=30.0``: avoid hanging indefinitely on unresponsive hosts.
        - ``raise_for_status()``: surface 4xx/5xx responses as exceptions
          instead of silently returning an error page as if it were content.

        Raises:
            httpx.HTTPStatusError: on a 4xx/5xx response.
            httpx.HTTPError: on network or timeout failures.
        """
        async with AsyncClient(follow_redirects=True, timeout=30.0) as client:
            response = await client.get(url, headers=self._CHROME_HEADERS)
            response.raise_for_status()
            return response.text

    def extract_text_from_html(self, html: str) -> str:
        """Return the visible text of ``html``, newline-separated."""
        soup = BeautifulSoup(html, 'html.parser')
        # Drop non-content elements so the extracted text holds only readable prose.
        for element in soup(['script', 'style', 'header', 'footer', 'nav']):
            element.decompose()
        return soup.get_text(separator='\n', strip=True)

    async def scrape_and_process(self, url: str) -> Tuple[str, list]:
        """Scrape ``url`` and return ``(full_text, chunk_documents)``.

        ``chunk_documents`` is whatever ``RecursiveCharacterTextSplitter
        .create_documents`` produces (a list of langchain Document objects).
        """
        html = await self.scrape_website(url)
        text = self.extract_text_from_html(html)
        documents = self.text_splitter.create_documents([text])
        return text, documents