# ContiAI-v4/tools/scraper/scraper_crawlee.py
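"""CrewAI web-scraping tool.

A thin wrapper around Crawlee's PlaywrightCrawler for rendered pages, with a
PyMuPDF (fitz) fallback that extracts plain text from direct PDF links.
"""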
import asyncio
import os
import tempfile
from urllib.parse import urlparse

import fitz  # PyMuPDF
import httpx  # sync HTTP client for the PDF download path
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crewai.tools import BaseTool


class WebScrapingCrawleeTool(BaseTool):
    # Pydantic fields on a crewai BaseTool need concrete defaults; a bare
    # Literal annotation would leave `name` as a required field.
    name: str = "web_scraping_crawlee_tool"
    description: str = (
        "Scrapes Arabic/English content from webpages using Crawlee PlaywrightCrawler "
        "or extracts readable text from PDF files. Returns a dictionary with keys: "
        "page_url, title, content, img_url, video_url, audio_url, pdf_url."
    )

    async def scrape_with_crawlee(self, target_url):
        """Crawl a single page and collect its title, HTML, and media URLs."""
        crawler = PlaywrightCrawler()
        extracted_data = {}

        @crawler.router.default_handler
        async def default_handler(context: PlaywrightCrawlingContext) -> None:
            page = context.page
            title = await page.title()
            # Full rendered HTML; downstream consumers can strip tags as needed.
            content = await page.content()
            # Collect media sources and PDF links straight from the DOM.
            images = await page.eval_on_selector_all(
                "img", "els => els.map(e => e.src)"
            )
            videos = await page.eval_on_selector_all(
                "video", "els => els.map(e => e.src)"
            )
            audios = await page.eval_on_selector_all(
                "audio", "els => els.map(e => e.src)"
            )
            pdfs = await page.eval_on_selector_all(
                "a[href$='.pdf']", "els => els.map(e => e.href)"
            )
            extracted_data.update(
                {
                    "title": title,
                    "content": content,
                    "img_url": images,
                    "video_url": videos,
                    "audio_url": audios,
                    "pdf_url": pdfs,
                }
            )

        await crawler.run([target_url])
        return extracted_data

    def extract_pdf_text(self, pdf_url):
        """Download a PDF and return its plain text via PyMuPDF."""
        # httpx is used directly here: this code path is synchronous, while
        # crawlee's HttpxHttpClient only exposes async request methods.
        response = httpx.get(pdf_url, follow_redirects=True)
        response.raise_for_status()
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(response.content)
            tmp_path = tmp.name
        try:
            text = ""
            with fitz.open(tmp_path) as doc:
                for page in doc:
                    text += page.get_text("text")
        finally:
            # Always clean up the temp file, even if parsing fails.
            os.remove(tmp_path)
        return text.strip()

    def _run(self, url: str) -> dict:
        try:
            parsed = urlparse(url)
            # Direct links to PDF files bypass the browser entirely.
            if parsed.path.lower().endswith(".pdf"):
                content = self.extract_pdf_text(url)
                return {
                    "page_url": url,
                    "title": os.path.basename(parsed.path),
                    "content": content,
                    "img_url": [],
                    "video_url": [],
                    "audio_url": [],
                    "pdf_url": [url],
                }
            # asyncio.run() raises RuntimeError when an event loop is already
            # running (e.g. inside FastAPI); async callers should use _arun().
            scraped_data = asyncio.run(self.scrape_with_crawlee(url))
            return {"page_url": url, **scraped_data}
        except Exception as e:
            return {"error": str(e), "page_url": url}

    async def _arun(self, url: str) -> dict:
        try:
            parsed = urlparse(url)
            if parsed.path.lower().endswith(".pdf"):
                # PDF extraction is synchronous, so delegating to _run is safe.
                return self._run(url)
            # Await the crawler directly; _run()'s asyncio.run() would fail here.
            scraped_data = await self.scrape_with_crawlee(url)
            return {"page_url": url, **scraped_data}
        except Exception as e:
            return {"error": str(e), "page_url": url}
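

# Minimal manual smoke test: a sketch assuming Playwright's browsers are
# installed (`playwright install`) and no event loop is already running.
# The URL below is a placeholder, not part of the project.
if __name__ == "__main__":
    tool = WebScrapingCrawleeTool()
    result = tool._run("https://example.com")
    print(result.get("title"), "-", len(str(result.get("content", ""))), "chars")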