# f5_model_final/app/services/scraper_service.py
# Author: EL GHAFRAOUI AYOUB
# Commit: 6f14d8b
from httpx import AsyncClient
from bs4 import BeautifulSoup
from typing import Tuple
from langchain_text_splitters import RecursiveCharacterTextSplitter
class ScraperService:
    """Fetch a web page, strip boilerplate HTML, and split the text into chunks."""

    # Hard cap on request time so a slow or hung server cannot block the
    # event loop's caller forever (httpx's default is finite but generous).
    REQUEST_TIMEOUT: float = 30.0

    def __init__(self):
        # Character-based splitter: ~1000-char chunks with a 20-char overlap so
        # a sentence cut at a chunk boundary still appears in both neighbours.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=20,
            length_function=len,
            is_separator_regex=False,
        )

    async def scrape_website(self, url: str) -> str:
        """Fetch *url* and return the response body as text.

        Raises:
            httpx.HTTPStatusError: on a 4xx/5xx response, instead of silently
                returning an error page for downstream parsing.
            httpx.TimeoutException: if the request exceeds REQUEST_TIMEOUT.
        """
        # Browser-like headers: many sites block requests carrying the default
        # client User-Agent.
        chrome_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
        }
        # follow_redirects=True: httpx does NOT follow redirects by default, so
        # without it an http->https (or trailing-slash) redirect would yield the
        # empty 301/302 body rather than the actual page.
        async with AsyncClient(
            follow_redirects=True, timeout=self.REQUEST_TIMEOUT
        ) as client:
            response = await client.get(url, headers=chrome_headers)
            response.raise_for_status()  # fail loudly on HTTP errors
            return response.text

    def extract_text_from_html(self, html: str) -> str:
        """Return the visible text of *html*, one extracted string per line.

        Script/style tags and common boilerplate containers (header, footer,
        nav) are removed before text extraction.
        """
        soup = BeautifulSoup(html, 'html.parser')
        # decompose() removes the tag and its contents from the tree entirely.
        for element in soup(['script', 'style', 'header', 'footer', 'nav']):
            element.decompose()
        return soup.get_text(separator='\n', strip=True)

    async def scrape_and_process(self, url: str) -> Tuple[str, list]:
        """Scrape *url* and return ``(full_text, documents)``.

        ``full_text`` is the extracted page text; ``documents`` is the list of
        langchain Document chunks produced by the configured text splitter.
        """
        html = await self.scrape_website(url)
        text = self.extract_text_from_html(html)
        documents = self.text_splitter.create_documents([text])
        return text, documents