"""Web-scraping service: fetch a page, strip boilerplate HTML, and split the text into chunks."""
from typing import Tuple

from bs4 import BeautifulSoup
from httpx import AsyncClient
from langchain_text_splitters import RecursiveCharacterTextSplitter
class ScraperService:
    """Fetch a web page, strip boilerplate markup, and split the visible text into chunks."""

    # Browser-like headers so sites that reject default HTTP-client user agents still respond.
    _CHROME_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 20):
        """Create the service.

        Args:
            chunk_size: Maximum characters per chunk (default matches the
                original hard-coded value, so existing callers are unaffected).
            chunk_overlap: Characters of overlap between consecutive chunks.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )

    async def scrape_website(self, url: str) -> str:
        """Return the raw HTML body of ``url``.

        Fixes over the original:
        - ``follow_redirects=True``: httpx does NOT follow redirects by
          default, so any site answering with a 301/302 previously yielded
          an empty or useless body.
        - ``timeout=30.0``: avoid hanging indefinitely on unresponsive hosts.
        - ``raise_for_status()``: surface 4xx/5xx responses as exceptions
          instead of silently returning an error page as if it were content.

        Raises:
            httpx.HTTPStatusError: on a 4xx/5xx response.
            httpx.HTTPError: on network or timeout failures.
        """
        async with AsyncClient(follow_redirects=True, timeout=30.0) as client:
            response = await client.get(url, headers=self._CHROME_HEADERS)
            response.raise_for_status()
            return response.text

    def extract_text_from_html(self, html: str) -> str:
        """Return the visible text of ``html``, newline-separated."""
        soup = BeautifulSoup(html, 'html.parser')
        # Drop non-content elements so the extracted text holds only readable prose.
        for element in soup(['script', 'style', 'header', 'footer', 'nav']):
            element.decompose()
        return soup.get_text(separator='\n', strip=True)

    async def scrape_and_process(self, url: str) -> Tuple[str, list]:
        """Scrape ``url`` and return ``(full_text, chunk_documents)``.

        ``chunk_documents`` is whatever ``RecursiveCharacterTextSplitter
        .create_documents`` produces (a list of langchain Document objects).
        """
        html = await self.scrape_website(url)
        text = self.extract_text_from_html(html)
        documents = self.text_splitter.create_documents([text])
        return text, documents