from crewai.tools import BaseTool
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext

import asyncio
import os
import tempfile
from urllib.parse import urlparse

import fitz  # PyMuPDF
import httpx


class WebScrapingCrawleeTool(BaseTool):
    name: str = "web_scraping_crawlee_tool"
    description: str = (
        "Scrapes Arabic/English content from webpages using Crawlee PlaywrightCrawler "
        "or extracts readable text from PDF files. Returns a dictionary: "
        "page_url, title, content, img_url, video_url, audio_url, pdf_url."
    )

    async def scrape_with_crawlee(self, target_url: str) -> dict:
        """Crawl a single page with Playwright and collect its media URLs."""
        crawler = PlaywrightCrawler()
        extracted_data: dict = {}

        @crawler.router.default_handler
        async def default_handler(context: PlaywrightCrawlingContext) -> None:
            page = context.page
            title = await page.title()
            content = await page.content()
            # Pull media sources and PDF links out of the rendered DOM.
            images = await page.eval_on_selector_all(
                "img", "els => els.map(e => e.src)"
            )
            videos = await page.eval_on_selector_all(
                "video", "els => els.map(e => e.src)"
            )
            audios = await page.eval_on_selector_all(
                "audio", "els => els.map(e => e.src)"
            )
            pdfs = await page.eval_on_selector_all(
                "a[href$='.pdf']", "els => els.map(e => e.href)"
            )
            extracted_data.update(
                {
                    "title": title,
                    "content": content,
                    "img_url": images,
                    "video_url": videos,
                    "audio_url": audios,
                    "pdf_url": pdfs,
                }
            )

        await crawler.run([target_url])
        return extracted_data

    def extract_pdf_text(self, pdf_url: str) -> str:
        """Download a PDF and extract its plain text with PyMuPDF."""
        # Plain httpx is used here instead of crawlee's HttpxHttpClient,
        # whose request methods are async-only; this method is synchronous.
        response = httpx.get(pdf_url, follow_redirects=True)
        response.raise_for_status()
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(response.content)
            tmp_path = tmp.name
        try:
            text = ""
            with fitz.open(tmp_path) as doc:
                for page in doc:
                    text += page.get_text("text")
        finally:
            # Always clean up the temp file, even if parsing fails.
            os.remove(tmp_path)
        return text.strip()

    def _run(self, url: str) -> dict:
        try:
            parsed = urlparse(url)
            if parsed.path.lower().endswith(".pdf"):
                content = self.extract_pdf_text(url)
                return {
                    "page_url": url,
                    "title": os.path.basename(parsed.path),
                    "content": content,
                    "img_url": [],
                    "video_url": [],
                    "audio_url": [],
                    "pdf_url": [url],
                }
            scraped_data = asyncio.run(self.scrape_with_crawlee(url))
            return {"page_url": url, **scraped_data}
        except Exception as e:
            return {"error": str(e), "page_url": url}

    async def _arun(self, url: str) -> dict:
        # Await the crawler directly instead of delegating to _run(), whose
        # asyncio.run() call would raise inside an already-running event loop.
        try:
            parsed = urlparse(url)
            if parsed.path.lower().endswith(".pdf"):
                return self._run(url)
            scraped_data = await self.scrape_with_crawlee(url)
            return {"page_url": url, **scraped_data}
        except Exception as e:
            return {"error": str(e), "page_url": url}
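

# A minimal usage sketch, not part of the original file: it instantiates the
# tool and runs it against a placeholder URL. In a CrewAI project the tool
# would normally be handed to an Agent via tools=[WebScrapingCrawleeTool()]
# rather than called directly.
if __name__ == "__main__":
    tool = WebScrapingCrawleeTool()
    # "https://example.com" is an illustrative target, not from the original.
    result = tool._run("https://example.com")
    print(result.get("title"), "| content length:", len(result.get("content", "")))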