Spaces:

ziadsameh32
/

ContiAI-v3

Sleeping

App Files Community

ziadsameh32 committed on Nov 26, 2025

Commit

ff2fbf2

1 Parent(s): 58ee4d2

scraper

Browse files

Files changed (7) hide show

Dockerfile +9 -36
agents/design_phase/__init__.py +1 -1
agents/design_phase/scraper/extractor_crawlee.py +90 -89
routers/scraper_route.py +96 -94
tools/__init__.py +2 -2
tools/scraper/no_agent/crawlee_scraper.py +22 -22
tools/scraper/scraper_crawlee.py +80 -80

Dockerfile CHANGED Viewed

@@ -1,45 +1,18 @@
-# ===== Base image =====
-FROM python:3.10-slim
-# ===== System dependencies =====
-RUN apt-get update && apt-get install -y \
-    wget \
-    curl \
-    unzip \
-    gnupg \
-    libglib2.0-0 \
-    libnss3 \
-    libgdk-pixbuf-xlib-2.0-0 \
-    libgtk-3-0 \
-    libxcomposite1 \
-    libxdamage1 \
-    libxrandr2 \
-    libxss1 \
-    libasound2 \
-    libxshmfence1 \
-    libgbm1 \
-    libpango-1.0-0 \
-    libpangocairo-1.0-0 \
-    libcairo2 \
-    fonts-liberation \
-    libappindicator3-1 \
-    xdg-utils \
-    && rm -rf /var/lib/apt/lists/*
-# ===== Work Directory =====
 WORKDIR /app
-# ===== Install Python requirements =====
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-# ===== Install Playwright Browsers =====
-RUN playwright install --with-deps chromium
 # انسخ ملفات المشروع
 COPY . /app
 # عرّف البورت اللي البرنامج هيشتغل عليه
 EXPOSE 7860

+# استخدم صورة أساسية خفيفة من Python
+FROM python:3.10
+# اضبط مجلد العمل داخل الحاوية
 WORKDIR /app
 # انسخ ملفات المشروع
 COPY . /app
+# انسخ ملف requirements لو عندك
+COPY requirements.txt .
+# نزّل المتطلبات
+RUN pip install --no-cache-dir -r requirements.txt
 # عرّف البورت اللي البرنامج هيشتغل عليه
 EXPOSE 7860

agents/design_phase/__init__.py CHANGED Viewed

@@ -2,4 +2,4 @@ from .keywoard_researcher import keyword_researcher_agent, keyword_researcher_ta
 from .source_finder import search_engine_agent, search_engine_task
 from .scraper.extractor_bulit_in import scraping_built_in_agent, scraping_built_in_task
 from .scraper.extractor_bs4 import scraping_bs4_agent, scraping_bs4_task
-from .scraper.extractor_crawlee import scraping_crawlee_agent, scraping_crawlee_task

 from .source_finder import search_engine_agent, search_engine_task
 from .scraper.extractor_bulit_in import scraping_built_in_agent, scraping_built_in_task
 from .scraper.extractor_bs4 import scraping_bs4_agent, scraping_bs4_task
+# from .scraper.extractor_crawlee import scraping_crawlee_agent, scraping_crawlee_task

agents/design_phase/scraper/extractor_crawlee.py CHANGED Viewed

@@ -1,94 +1,95 @@
-from crewai import Agent, Task
-from modules import llm_g
-from tools import pdf_tool, scraping_tool
-from tools import WebScrapingCrawleeTool
-from schemas import UnitSubtopicOutputModel
-web_scraper = WebScrapingCrawleeTool()
-scraping_crawlee_agent = Agent(
-    role="Educational Content Scraping & Knowledge Extraction Agent",
-    goal="\n".join(
-        [
-            "Collect and extract complete, structured, and educationally valuable content "
-            "from Arabic and English websites and PDFs related to the course topic: {topic}.",
-            "Focus on sources that match the course domain ({domain}), content type ({content_type}), "
-            "and audience ({audience}).",
-            "Prioritize materials that can serve as strong foundations for creating {material_type} "
-            "learning materials (conceptual, structural, procedural, and real-world).",
-            "Extract full text including all sections, examples, and details, ensuring high accuracy for Arabic text.",
-            "Assess each source’s credibility and educational relevance, ranking them by usefulness for course design.",
-            "Provide concise expert notes and recommendations that will assist curriculum developers and instructional designers "
-            "in selecting the best materials for building a complete learning unit.",
-        ]
-    ),
-    backstory="\n".join(
-        [
-            "You are a specialized educational data researcher trained to explore, extract, and organize academic and professional content.",
-            "You excel at discovering high-quality Arabic and English resources that align with specific course development objectives.",
-            "Your mission is to help course designers collect trustworthy, pedagogically sound materials that will form the backbone of educational units.",
-            "You understand how to evaluate the quality, relevance, and credibility of both web pages and PDFs.",
-            "You are particularly skilled at preserving Arabic text integrity and extracting complete structured information for learning materials.",
-        ]
-    ),
-    llm=llm_g,
-    tools=[web_scraper],
-    verbose=True,
-)
-scraping_crawlee_task = Task(
-    description="\n".join(
-        [
-            "Your task is to extract and organize full educational content from the following source:",
-            "",
-            "URL: {url}",
-            "Unit Title: {unit_title}",
-            "Subtopic Title: {subtopic_title}",
-            "Query Used: {query}",
-            "",
-            "This link is part of the course topic '{topic}' under the domain '{domain}'.",
-            "The extracted content should help create educational materials for the audience '{audience}', focusing on '{content_type}' learning goals.",
-            "",
-            "For the given URL:",
-            "  - Extract the full title, structured text, and any available media (images, videos, audios, PDFs).",
-            "  - Maintain the Arabic text structure and readability.",
-            "  - Evaluate its reliability and educational value in relation to {material_type} material categories.",
-            "  - Assign an agent recommendation rank (0–5) based on credibility and relevance.",
-            "  - Provide short expert notes justifying the ranking and explaining how the content can contribute to the course design.",
-            "",
-            "Ensure no important content, examples, or explanations are omitted from extraction.",
-        ]
-    ),
-    expected_output=(
-        "Return ONLY a valid Python dictionary.\n"
-        "- Do not include explanations, markdown, or code fences.\n"
-        "- The dictionary must be UTF-8 safe and directly usable in Python with ast.literal_eval.\n"
-        "- Keys must be wrapped in double quotes.\n\n"
-        "Format example:\n"
-        "{\n"
-        '  "unit_title": "من الفكرة إلى نموذج العمل: بناء الأساس الريادي",\n'
-        '  "subtopic_title": "مفهوم ريادة الأعمال وأهميتها الاقتصادية والاجتماعية",\n'
-        '  "query": "دور ريادة الأعمال في التنمية الاجتماعية",\n'
-        '  "parts": [\n'
-        "    {\n"
-        '      "page_url": "https://example.com/page1",\n'
-        '      "title": "Understanding Entrepreneurship in the Arab World",\n'
-        '      "content": "Full educational content extracted from the site.",\n'
-        '      "img_url": ["https://example.com/image1.jpg"],\n'
-        '      "video_url": ["https://example.com/video1.mp4"],\n'
-        '      "audio_url": ["https://example.com/audio1.mp3"],\n'
-        '      "pdf_url": ["https://example.com/file1.pdf"],\n'
-        '      "agent_recommendation_rank": 4.8,\n'
-        '      "agent_recommendation_notes": "Rich Arabic content, relevant to conceptual materials."\n'
-        "    }\n"
-        "  ]\n"
-        "}\n\n"
-        "Make the output compatible with Python's ast library (use r1 = result.dict()['raw']; f_result = ast.literal_eval(r1)).\n"
-        "Ensure valid JSON syntax with no unterminated strings or extra text.\n"
-        "Output ONLY the dictionary — no thoughts, explanations, or markdown formatting."
-    ),
-    agent=scraping_crawlee_agent,
-    output_json=UnitSubtopicOutputModel,
-)

+# from crewai import Agent, Task
+# from modules import llm_g
+# from tools import pdf_tool, scraping_tool
+# from tools import WebScrapingCrawleeTool
+# from schemas import UnitSubtopicOutputModel
+# web_scraper = WebScrapingCrawleeTool()
+# scraping_crawlee_agent = Agent(
+#     role="Educational Content Scraping & Knowledge Extraction Agent",
+#     goal="\n".join(
+#         [
+#             "Collect and extract complete, structured, and educationally valuable content "
+#             "from Arabic and English websites and PDFs related to the course topic: {topic}.",
+#             "Focus on sources that match the course domain ({domain}), content type ({content_type}), "
+#             "and audience ({audience}).",
+#             "Prioritize materials that can serve as strong foundations for creating {material_type} "
+#             "learning materials (conceptual, structural, procedural, and real-world).",
+#             "Extract full text including all sections, examples, and details, ensuring high accuracy for Arabic text.",
+#             "Assess each source’s credibility and educational relevance, ranking them by usefulness for course design.",
+#             "Provide concise expert notes and recommendations that will assist curriculum developers and instructional designers "
+#             "in selecting the best materials for building a complete learning unit.",
+#         ]
+#     ),
+#     backstory="\n".join(
+#         [
+#             "You are a specialized educational data researcher trained to explore, extract, and organize academic and professional content.",
+#             "You excel at discovering high-quality Arabic and English resources that align with specific course development objectives.",
+#             "Your mission is to help course designers collect trustworthy, pedagogically sound materials that will form the backbone of educational units.",
+#             "You understand how to evaluate the quality, relevance, and credibility of both web pages and PDFs.",
+#             "You are particularly skilled at preserving Arabic text integrity and extracting complete structured information for learning materials.",
+#         ]
+#     ),
+#     llm=llm_g,
+#     tools=[web_scraper],
+#     verbose=True,
+# )
+# scraping_crawlee_task = Task(
+#     description="\n".join(
+#         [
+#             "Your task is to extract and organize full educational content from the following source:",
+#             "",
+#             "URL: {url}",
+#             "Unit Title: {unit_title}",
+#             "Subtopic Title: {subtopic_title}",
+#             "Query Used: {query}",
+#             "",
+#             "This link is part of the course topic '{topic}' under the domain '{domain}'.",
+#             "The extracted content should help create educational materials for the audience '{audience}', focusing on '{content_type}' learning goals.",
+#             "",
+#             "For the given URL:",
+#             "  - Extract the full title, structured text, and any available media (images, videos, audios, PDFs).",
+#             "  - Maintain the Arabic text structure and readability.",
+#             "  - Evaluate its reliability and educational value in relation to {material_type} material categories.",
+#             "  - Assign an agent recommendation rank (0–5) based on credibility and relevance.",
+#             "  - Provide short expert notes justifying the ranking and explaining how the content can contribute to the course design.",
+#             "",
+#             "Ensure no important content, examples, or explanations are omitted from extraction.",
+#         ]
+#     ),
+#     expected_output=(
+#         "Return ONLY a valid Python dictionary.\n"
+#         "- Do not include explanations, markdown, or code fences.\n"
+#         "- The dictionary must be UTF-8 safe and directly usable in Python with ast.literal_eval.\n"
+#         "- Keys must be wrapped in double quotes.\n\n"
+#         "Format example:\n"
+#         "{\n"
+#         '  "unit_title": "من الفكرة إلى نموذج العمل: بناء الأساس الريادي",\n'
+#         '  "subtopic_title": "مفهوم ريادة الأعمال وأهميتها الاقتصادية والاجتماعية",\n'
+#         '  "query": "دور ريادة الأعمال في التنمية الاجتماعية",\n'
+#         '  "parts": [\n'
+#         "    {\n"
+#         '      "page_url": "https://example.com/page1",\n'
+#         '      "title": "Understanding Entrepreneurship in the Arab World",\n'
+#         '      "content": "Full educational content extracted from the site.",\n'
+#         '      "img_url": ["https://example.com/image1.jpg"],\n'
+#         '      "video_url": ["https://example.com/video1.mp4"],\n'
+#         '      "audio_url": ["https://example.com/audio1.mp3"],\n'
+#         '      "pdf_url": ["https://example.com/file1.pdf"],\n'
+#         '      "agent_recommendation_rank": 4.8,\n'
+#         '      "agent_recommendation_notes": "Rich Arabic content, relevant to conceptual materials."\n'
+#         "    }\n"
+#         "  ]\n"
+#         "}\n\n"
+#         "Make the output compatible with Python's ast library (use r1 = result.dict()['raw']; f_result = ast.literal_eval(r1)).\n"
+#         "Ensure valid JSON syntax with no unterminated strings or extra text.\n"
+#         "Output ONLY the dictionary — no thoughts, explanations, or markdown formatting."
+#     ),
+#     agent=scraping_crawlee_agent,
+#     output_json=UnitSubtopicOutputModel,
+# )
+# #

routers/scraper_route.py CHANGED Viewed

@@ -3,9 +3,9 @@ import json
 import shutil
 from fastapi import APIRouter, HTTPException, UploadFile, Form, Request
 from crewai import Crew, Process
-from agents.design_phase import scraping_built_in_agent, scraping_built_in_task
 from schemas import DNAMetadata, OutlineInput
-from tools import scrape_with_bs4,crawl_url
 router = APIRouter(prefix="/design", tags=["Design"])
@@ -193,97 +193,97 @@ async def run_training(request: Request, file: UploadFile, data: str = Form(...)
         "json_dict": output_data,
     }
-#################################
-#           crawlee             #
-#################################
-@router.post("/scraper_crawlee")
-async def run_training(request: Request, file: UploadFile, data: str = Form(...)):
-    """Uploads keywords JSON + metadata JSON, runs CrewAI search, returns download link."""
-    # ✅ Parse metadata JSON
-    try:
-        parsed_data = json.loads(data)
-        metadata = OutlineInput(**parsed_data)
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=f"Invalid JSON in 'data': {e}")
-    # ✅ Save uploaded file temporarily
-    save_path = f"/tmp/{file.filename}"
-    with open(save_path, "wb") as buffer:
-        shutil.copyfileobj(file.file, buffer)
-    # ✅ Validate file extension
-    if not save_path.lower().endswith(".json"):
-        raise HTTPException(status_code=400, detail="File must be a JSON file")
-    # ✅ Load file content
-    try:
-        with open(save_path, "r", encoding="utf-8") as f:
-            urls_data = json.load(f)
-    except json.JSONDecodeError:
-        raise HTTPException(status_code=400, detail="Invalid JSON file content")
-    # ✅ Initialize Crew
-    crew = Crew(
-        agents=[scraping_built_in_agent],
-        tasks=[scraping_built_in_task],
-        process=Process.sequential,
-    )
-    # ✅ Build static user metadata
-    user_inputs = DNAMetadata(
-        topic=metadata.topic,
-        domain=metadata.domain,
-        content_type=metadata.content_type,
-        audience=metadata.audience,
-        material_type=metadata.material_type,
-    ).dict()
-    all_results = []
-    # ✅ Iterate through each topic unit and result link
-    for unit in urls_data["results"]:
-        unit_title = unit["unit_title"]
-        subtopic_title = unit["subtopic_title"]
-        query = unit["query"]
-        for result_item in unit["results"]:
-            url = result_item["url"]
-            print(f"🔍 Running scrape for [{subtopic_title}] | URL: {url}")
-            merged_input = {
-                **user_inputs,
-                "url": url,
-                "unit_title": unit_title,
-                "subtopic_title": subtopic_title,
-                "query": query,
-            }
-            try:
-                result = crew.kickoff(inputs=merged_input)
-                all_results.append(result.dict())
-            except Exception as e:
-                print(f"⚠️ Error while processing '{url}': {e}")
-    # ✅ Save aggregated results
-    output_data = {"results": all_results}
-    output_file = "/tmp/search_results.json"
-    with open(output_file, "w", encoding="utf-8") as f:
-        json.dump(output_data, f, ensure_ascii=False, indent=2)
-    # ✅ Build download URL
-    base_url = str(request.base_url).rstrip("/")
-    download_link = (
-        f"{base_url}/design/download?filename={os.path.basename(output_file)}"
-    )
-    return {
-        "message": "Scraping process completed successfully 🚀",
-        "total_queries": len(all_results),
-        "download_link": download_link,
-        "result": all_results,
-        "json_dict": output_data,
-    }
 ##############################
@@ -332,7 +332,9 @@ async def process_json_scrape(request: Request, file: UploadFile, data: str, mod
                 if mode == "bs4":
                     scraped = scrape_with_bs4(url)
                 else:
-                    scraped = await crawl_url(url)
                 all_results.append(
                     {

 import shutil
 from fastapi import APIRouter, HTTPException, UploadFile, Form, Request
 from crewai import Crew, Process
+from agents.design_phase import scraping_built_in_agent, scraping_built_in_task,scraping_bs4_agent,scraping_bs4_task
 from schemas import DNAMetadata, OutlineInput
+from tools import scrape_with_bs4#,crawl_url
 router = APIRouter(prefix="/design", tags=["Design"])
         "json_dict": output_data,
     }
+# #################################
+# #           crawlee             #
+# #################################
+# @router.post("/scraper_crawlee")
+# async def run_training(request: Request, file: UploadFile, data: str = Form(...)):
+#     """Uploads keywords JSON + metadata JSON, runs CrewAI search, returns download link."""
+#     # ✅ Parse metadata JSON
+#     try:
+#         parsed_data = json.loads(data)
+#         metadata = OutlineInput(**parsed_data)
+#     except Exception as e:
+#         raise HTTPException(status_code=400, detail=f"Invalid JSON in 'data': {e}")
+#     # ✅ Save uploaded file temporarily
+#     save_path = f"/tmp/{file.filename}"
+#     with open(save_path, "wb") as buffer:
+#         shutil.copyfileobj(file.file, buffer)
+#     # ✅ Validate file extension
+#     if not save_path.lower().endswith(".json"):
+#         raise HTTPException(status_code=400, detail="File must be a JSON file")
+#     # ✅ Load file content
+#     try:
+#         with open(save_path, "r", encoding="utf-8") as f:
+#             urls_data = json.load(f)
+#     except json.JSONDecodeError:
+#         raise HTTPException(status_code=400, detail="Invalid JSON file content")
+#     # ✅ Initialize Crew
+#     crew = Crew(
+#         agents=[scraping_built_in_agent],
+#         tasks=[scraping_built_in_task],
+#         process=Process.sequential,
+#     )
+#     # ✅ Build static user metadata
+#     user_inputs = DNAMetadata(
+#         topic=metadata.topic,
+#         domain=metadata.domain,
+#         content_type=metadata.content_type,
+#         audience=metadata.audience,
+#         material_type=metadata.material_type,
+#     ).dict()
+#     all_results = []
+#     # ✅ Iterate through each topic unit and result link
+#     for unit in urls_data["results"]:
+#         unit_title = unit["unit_title"]
+#         subtopic_title = unit["subtopic_title"]
+#         query = unit["query"]
+#         for result_item in unit["results"]:
+#             url = result_item["url"]
+#             print(f"🔍 Running scrape for [{subtopic_title}] | URL: {url}")
+#             merged_input = {
+#                 **user_inputs,
+#                 "url": url,
+#                 "unit_title": unit_title,
+#                 "subtopic_title": subtopic_title,
+#                 "query": query,
+#             }
+#             try:
+#                 result = crew.kickoff(inputs=merged_input)
+#                 all_results.append(result.dict())
+#             except Exception as e:
+#                 print(f"⚠️ Error while processing '{url}': {e}")
+#     # ✅ Save aggregated results
+#     output_data = {"results": all_results}
+#     output_file = "/tmp/search_results.json"
+#     with open(output_file, "w", encoding="utf-8") as f:
+#         json.dump(output_data, f, ensure_ascii=False, indent=2)
+#     # ✅ Build download URL
+#     base_url = str(request.base_url).rstrip("/")
+#     download_link = (
+#         f"{base_url}/design/download?filename={os.path.basename(output_file)}"
+#     )
+#     return {
+#         "message": "Scraping process completed successfully 🚀",
+#         "total_queries": len(all_results),
+#         "download_link": download_link,
+#         "result": all_results,
+#         "json_dict": output_data,
+#     }
 ##############################
                 if mode == "bs4":
                     scraped = scrape_with_bs4(url)
                 else:
+                    #scraped = await crawl_url(url)
+                    break
                 all_results.append(
                     {

tools/__init__.py CHANGED Viewed

@@ -1,10 +1,10 @@
 # from .chapter_division import split_json
 # from .scraper import web_scraping_tool
 # from .tavily import search_engine_tool, is_recent
-from .scraper.scraper_crawlee import WebScrapingCrawleeTool
 from .scraper.scraper_bs4 import WebScrapingToolBS4
 from .scraper.scraper_built_in import pdf_tool, scraping_tool
 from .validate_url import URLValidatorTool
 from .scraper.no_agent.pdf_extractor import extract_pdf_content
 from .scraper.no_agent.bs4_scraper import scrape_with_bs4
-from .scraper.no_agent.crawlee_scraper import crawl_url

 # from .chapter_division import split_json
 # from .scraper import web_scraping_tool
 # from .tavily import search_engine_tool, is_recent
+# from .scraper.scraper_crawlee import WebScrapingCrawleeTool
 from .scraper.scraper_bs4 import WebScrapingToolBS4
 from .scraper.scraper_built_in import pdf_tool, scraping_tool
 from .validate_url import URLValidatorTool
 from .scraper.no_agent.pdf_extractor import extract_pdf_content
 from .scraper.no_agent.bs4_scraper import scrape_with_bs4
+# from .scraper.no_agent.crawlee_scraper import crawl_url

tools/scraper/no_agent/crawlee_scraper.py CHANGED Viewed

@@ -1,31 +1,31 @@
-from crawlee import HttpCrawler, run_crawler
-from tools import extract_pdf_content
-class SimpleCrawler(HttpCrawler):
-    async def handle_page(self, context):
-        url = context.request.url
-        # PDF case
-        if url.lower().endswith(".pdf"):
-            content = extract_pdf_content(url)
-            return {"url": url, "content": content}
-        # HTML case
-        html = await context.body
-        content = html.decode("utf-8", errors="ignore")
-        return {"url": url, "content": content}
-async def crawl_url(url: str):
-    crawler = SimpleCrawler()
-    results = []
-    @crawler.router.default_handler
-    async def handle_page(ctx):
-        res = await crawler.handle_page(ctx)
-        results.append(res)
-    await run_crawler(crawler, start_urls=[url])
-    return results[0]

+# from crawlee import HttpCrawler, run_crawler
+# from tools import extract_pdf_content
+# class SimpleCrawler(HttpCrawler):
+#     async def handle_page(self, context):
+#         url = context.request.url
+#         # PDF case
+#         if url.lower().endswith(".pdf"):
+#             content = extract_pdf_content(url)
+#             return {"url": url, "content": content}
+#         # HTML case
+#         html = await context.body
+#         content = html.decode("utf-8", errors="ignore")
+#         return {"url": url, "content": content}
+# async def crawl_url(url: str):
+#     crawler = SimpleCrawler()
+#     results = []
+#     @crawler.router.default_handler
+#     async def handle_page(ctx):
+#         res = await crawler.handle_page(ctx)
+#         results.append(res)
+#     await run_crawler(crawler, start_urls=[url])
+#     return results[0]

tools/scraper/scraper_crawlee.py CHANGED Viewed

@@ -1,90 +1,90 @@
-from crewai.tools import BaseTool
-from typing import Literal
-from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
-from crawlee.http_clients import HttpxHttpClient
-import fitz
-import asyncio
-import tempfile
-import os
-from urllib.parse import urlparse
-class WebScrapingCrawleeTool(BaseTool):
-    name: Literal["web_scraping_crawlee_tool"]
-    description: str = (
-        "Scrapes Arabic/English content from webpages using Crawlee PlaywrightCrawler "
-        "or extracts readable text from PDF files. Returns a dictionary: "
-        "page_url, title, content, img_url, video_url, audio_url, pdf_url."
-    )
-    async def scrape_with_crawlee(self, target_url):
-        crawler = PlaywrightCrawler()
-        extracted_data = {}
-        @crawler.router.default_handler
-        async def default_handler(context: PlaywrightCrawlingContext) -> None:
-            page = context.page
-            title = await page.title()
-            content = await page.content()
-            images = await page.eval_on_selector_all(
-                "img", "els => els.map(e => e.src)"
-            )
-            videos = await page.eval_on_selector_all(
-                "video", "els => els.map(e => e.src)"
-            )
-            audios = await page.eval_on_selector_all(
-                "audio", "els => els.map(e => e.src)"
-            )
-            pdfs = await page.eval_on_selector_all(
-                "a[href$='.pdf']", "els => els.map(e => e.href)"
-            )
-            extracted_data.update(
-                {
-                    "title": title,
-                    "content": content,
-                    "img_url": images,
-                    "video_url": videos,
-                    "audio_url": audios,
-                    "pdf_url": pdfs,
-                }
-            )
-        await crawler.run([target_url])
-        return extracted_data
-    def extract_pdf_text(self, pdf_url):
-        http = HttpxHttpClient()
-        response = http.get(pdf_url)
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
-            tmp.write(response.content)
-            tmp_path = tmp.name
-        text = ""
-        with fitz.open(tmp_path) as doc:
-            for page in doc:
-                text += page.get_text("text")
-        os.remove(tmp_path)
-        return text.strip()
-    def _run(self, url: str) -> dict:
-        try:
-            parsed = urlparse(url)
-            if parsed.path.lower().endswith(".pdf"):
-                content = self.extract_pdf_text(url)
-                return {
-                    "page_url": url,
-                    "title": os.path.basename(parsed.path),
-                    "content": content,
-                    "img_url": [],
-                    "video_url": [],
-                    "audio_url": [],
-                    "pdf_url": [url],
-                }
-            scraped_data = asyncio.run(self.scrape_with_crawlee(url))
-            return {"page_url": url, **scraped_data}
-        except Exception as e:
-            return {"error": str(e), "page_url": url}
-    async def _arun(self, url: str) -> dict:
-        return self._run(url)

+# from crewai.tools import BaseTool
+# from typing import Literal
+# from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
+# from crawlee.http_clients import HttpxHttpClient
+# import fitz
+# import asyncio
+# import tempfile
+# import os
+# from urllib.parse import urlparse
+# class WebScrapingCrawleeTool(BaseTool):
+#     name: Literal["web_scraping_crawlee_tool"]
+#     description: str = (
+#         "Scrapes Arabic/English content from webpages using Crawlee PlaywrightCrawler "
+#         "or extracts readable text from PDF files. Returns a dictionary: "
+#         "page_url, title, content, img_url, video_url, audio_url, pdf_url."
+#     )
+#     async def scrape_with_crawlee(self, target_url):
+#         crawler = PlaywrightCrawler()
+#         extracted_data = {}
+#         @crawler.router.default_handler
+#         async def default_handler(context: PlaywrightCrawlingContext) -> None:
+#             page = context.page
+#             title = await page.title()
+#             content = await page.content()
+#             images = await page.eval_on_selector_all(
+#                 "img", "els => els.map(e => e.src)"
+#             )
+#             videos = await page.eval_on_selector_all(
+#                 "video", "els => els.map(e => e.src)"
+#             )
+#             audios = await page.eval_on_selector_all(
+#                 "audio", "els => els.map(e => e.src)"
+#             )
+#             pdfs = await page.eval_on_selector_all(
+#                 "a[href$='.pdf']", "els => els.map(e => e.href)"
+#             )
+#             extracted_data.update(
+#                 {
+#                     "title": title,
+#                     "content": content,
+#                     "img_url": images,
+#                     "video_url": videos,
+#                     "audio_url": audios,
+#                     "pdf_url": pdfs,
+#                 }
+#             )
+#         await crawler.run([target_url])
+#         return extracted_data
+#     def extract_pdf_text(self, pdf_url):
+#         http = HttpxHttpClient()
+#         response = http.get(pdf_url)
+#         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+#             tmp.write(response.content)
+#             tmp_path = tmp.name
+#         text = ""
+#         with fitz.open(tmp_path) as doc:
+#             for page in doc:
+#                 text += page.get_text("text")
+#         os.remove(tmp_path)
+#         return text.strip()
+#     def _run(self, url: str) -> dict:
+#         try:
+#             parsed = urlparse(url)
+#             if parsed.path.lower().endswith(".pdf"):
+#                 content = self.extract_pdf_text(url)
+#                 return {
+#                     "page_url": url,
+#                     "title": os.path.basename(parsed.path),
+#                     "content": content,
+#                     "img_url": [],
+#                     "video_url": [],
+#                     "audio_url": [],
+#                     "pdf_url": [url],
+#                 }
+#             scraped_data = asyncio.run(self.scrape_with_crawlee(url))
+#             return {"page_url": url, **scraped_data}
+#         except Exception as e:
+#             return {"error": str(e), "page_url": url}
+#     async def _arun(self, url: str) -> dict:
+#         return self._run(url)