Spaces:
Sleeping
Sleeping
Commit
·
ff2fbf2
1
Parent(s):
58ee4d2
scraper
Browse files
- Dockerfile +9 -36
- agents/design_phase/__init__.py +1 -1
- agents/design_phase/scraper/extractor_crawlee.py +90 -89
- routers/scraper_route.py +96 -94
- tools/__init__.py +2 -2
- tools/scraper/no_agent/crawlee_scraper.py +22 -22
- tools/scraper/scraper_crawlee.py +80 -80
Dockerfile
CHANGED
|
@@ -1,45 +1,18 @@
|
|
| 1 |
-
#
|
| 2 |
-
FROM python:3.10
|
| 3 |
|
| 4 |
-
#
|
| 5 |
-
RUN apt-get update && apt-get install -y \
|
| 6 |
-
wget \
|
| 7 |
-
curl \
|
| 8 |
-
unzip \
|
| 9 |
-
gnupg \
|
| 10 |
-
libglib2.0-0 \
|
| 11 |
-
libnss3 \
|
| 12 |
-
libgdk-pixbuf-xlib-2.0-0 \
|
| 13 |
-
libgtk-3-0 \
|
| 14 |
-
libxcomposite1 \
|
| 15 |
-
libxdamage1 \
|
| 16 |
-
libxrandr2 \
|
| 17 |
-
libxss1 \
|
| 18 |
-
libasound2 \
|
| 19 |
-
libxshmfence1 \
|
| 20 |
-
libgbm1 \
|
| 21 |
-
libpango-1.0-0 \
|
| 22 |
-
libpangocairo-1.0-0 \
|
| 23 |
-
libcairo2 \
|
| 24 |
-
fonts-liberation \
|
| 25 |
-
libappindicator3-1 \
|
| 26 |
-
xdg-utils \
|
| 27 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 28 |
-
|
| 29 |
-
# ===== Work Directory =====
|
| 30 |
WORKDIR /app
|
| 31 |
|
| 32 |
-
# ===== Install Python requirements =====
|
| 33 |
-
COPY requirements.txt .
|
| 34 |
-
RUN pip install --no-cache-dir -r requirements.txt
|
| 35 |
-
|
| 36 |
-
# ===== Install Playwright Browsers =====
|
| 37 |
-
RUN playwright install --with-deps chromium
|
| 38 |
-
|
| 39 |
-
|
| 40 |
# انسخ ملفات المشروع
|
| 41 |
COPY . /app
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
# عرّف البورت اللي البرنامج هيشتغل عليه
|
| 45 |
EXPOSE 7860
|
|
|
|
| 1 |
+
# استخدم صورة أساسية خفيفة من Python
|
| 2 |
+
FROM python:3.10
|
| 3 |
|
| 4 |
+
# اضبط مجلد العمل داخل الحاوية
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
WORKDIR /app
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
# انسخ ملفات المشروع
|
| 8 |
COPY . /app
|
| 9 |
|
| 10 |
+
# انسخ ملف requirements لو عندك
|
| 11 |
+
COPY requirements.txt .
|
| 12 |
+
|
| 13 |
+
# نزّل المتطلبات
|
| 14 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 15 |
+
|
| 16 |
|
| 17 |
# عرّف البورت اللي البرنامج هيشتغل عليه
|
| 18 |
EXPOSE 7860
|
agents/design_phase/__init__.py
CHANGED
|
@@ -2,4 +2,4 @@ from .keywoard_researcher import keyword_researcher_agent, keyword_researcher_ta
|
|
| 2 |
from .source_finder import search_engine_agent, search_engine_task
|
| 3 |
from .scraper.extractor_bulit_in import scraping_built_in_agent, scraping_built_in_task
|
| 4 |
from .scraper.extractor_bs4 import scraping_bs4_agent, scraping_bs4_task
|
| 5 |
-
from .scraper.extractor_crawlee import scraping_crawlee_agent, scraping_crawlee_task
|
|
|
|
| 2 |
from .source_finder import search_engine_agent, search_engine_task
|
| 3 |
from .scraper.extractor_bulit_in import scraping_built_in_agent, scraping_built_in_task
|
| 4 |
from .scraper.extractor_bs4 import scraping_bs4_agent, scraping_bs4_task
|
| 5 |
+
# from .scraper.extractor_crawlee import scraping_crawlee_agent, scraping_crawlee_task
|
agents/design_phase/scraper/extractor_crawlee.py
CHANGED
|
@@ -1,94 +1,95 @@
|
|
| 1 |
-
from crewai import Agent, Task
|
| 2 |
-
from modules import llm_g
|
| 3 |
-
from tools import pdf_tool, scraping_tool
|
| 4 |
-
from tools import WebScrapingCrawleeTool
|
| 5 |
-
from schemas import UnitSubtopicOutputModel
|
| 6 |
|
| 7 |
|
| 8 |
-
web_scraper = WebScrapingCrawleeTool()
|
| 9 |
|
| 10 |
-
scraping_crawlee_agent = Agent(
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
)
|
| 39 |
|
| 40 |
|
| 41 |
-
scraping_crawlee_task = Task(
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
)
|
|
|
|
|
|
| 1 |
+
# from crewai import Agent, Task
|
| 2 |
+
# from modules import llm_g
|
| 3 |
+
# from tools import pdf_tool, scraping_tool
|
| 4 |
+
# from tools import WebScrapingCrawleeTool
|
| 5 |
+
# from schemas import UnitSubtopicOutputModel
|
| 6 |
|
| 7 |
|
| 8 |
+
# web_scraper = WebScrapingCrawleeTool()
|
| 9 |
|
| 10 |
+
# scraping_crawlee_agent = Agent(
|
| 11 |
+
# role="Educational Content Scraping & Knowledge Extraction Agent",
|
| 12 |
+
# goal="\n".join(
|
| 13 |
+
# [
|
| 14 |
+
# "Collect and extract complete, structured, and educationally valuable content "
|
| 15 |
+
# "from Arabic and English websites and PDFs related to the course topic: {topic}.",
|
| 16 |
+
# "Focus on sources that match the course domain ({domain}), content type ({content_type}), "
|
| 17 |
+
# "and audience ({audience}).",
|
| 18 |
+
# "Prioritize materials that can serve as strong foundations for creating {material_type} "
|
| 19 |
+
# "learning materials (conceptual, structural, procedural, and real-world).",
|
| 20 |
+
# "Extract full text including all sections, examples, and details, ensuring high accuracy for Arabic text.",
|
| 21 |
+
# "Assess each source’s credibility and educational relevance, ranking them by usefulness for course design.",
|
| 22 |
+
# "Provide concise expert notes and recommendations that will assist curriculum developers and instructional designers "
|
| 23 |
+
# "in selecting the best materials for building a complete learning unit.",
|
| 24 |
+
# ]
|
| 25 |
+
# ),
|
| 26 |
+
# backstory="\n".join(
|
| 27 |
+
# [
|
| 28 |
+
# "You are a specialized educational data researcher trained to explore, extract, and organize academic and professional content.",
|
| 29 |
+
# "You excel at discovering high-quality Arabic and English resources that align with specific course development objectives.",
|
| 30 |
+
# "Your mission is to help course designers collect trustworthy, pedagogically sound materials that will form the backbone of educational units.",
|
| 31 |
+
# "You understand how to evaluate the quality, relevance, and credibility of both web pages and PDFs.",
|
| 32 |
+
# "You are particularly skilled at preserving Arabic text integrity and extracting complete structured information for learning materials.",
|
| 33 |
+
# ]
|
| 34 |
+
# ),
|
| 35 |
+
# llm=llm_g,
|
| 36 |
+
# tools=[web_scraper],
|
| 37 |
+
# verbose=True,
|
| 38 |
+
# )
|
| 39 |
|
| 40 |
|
| 41 |
+
# scraping_crawlee_task = Task(
|
| 42 |
+
# description="\n".join(
|
| 43 |
+
# [
|
| 44 |
+
# "Your task is to extract and organize full educational content from the following source:",
|
| 45 |
+
# "",
|
| 46 |
+
# "URL: {url}",
|
| 47 |
+
# "Unit Title: {unit_title}",
|
| 48 |
+
# "Subtopic Title: {subtopic_title}",
|
| 49 |
+
# "Query Used: {query}",
|
| 50 |
+
# "",
|
| 51 |
+
# "This link is part of the course topic '{topic}' under the domain '{domain}'.",
|
| 52 |
+
# "The extracted content should help create educational materials for the audience '{audience}', focusing on '{content_type}' learning goals.",
|
| 53 |
+
# "",
|
| 54 |
+
# "For the given URL:",
|
| 55 |
+
# " - Extract the full title, structured text, and any available media (images, videos, audios, PDFs).",
|
| 56 |
+
# " - Maintain the Arabic text structure and readability.",
|
| 57 |
+
# " - Evaluate its reliability and educational value in relation to {material_type} material categories.",
|
| 58 |
+
# " - Assign an agent recommendation rank (0–5) based on credibility and relevance.",
|
| 59 |
+
# " - Provide short expert notes justifying the ranking and explaining how the content can contribute to the course design.",
|
| 60 |
+
# "",
|
| 61 |
+
# "Ensure no important content, examples, or explanations are omitted from extraction.",
|
| 62 |
+
# ]
|
| 63 |
+
# ),
|
| 64 |
+
# expected_output=(
|
| 65 |
+
# "Return ONLY a valid Python dictionary.\n"
|
| 66 |
+
# "- Do not include explanations, markdown, or code fences.\n"
|
| 67 |
+
# "- The dictionary must be UTF-8 safe and directly usable in Python with ast.literal_eval.\n"
|
| 68 |
+
# "- Keys must be wrapped in double quotes.\n\n"
|
| 69 |
+
# "Format example:\n"
|
| 70 |
+
# "{\n"
|
| 71 |
+
# ' "unit_title": "من الفكرة إلى نموذج العمل: بناء الأساس الريادي",\n'
|
| 72 |
+
# ' "subtopic_title": "مفهوم ريادة الأعمال وأهميتها الاقتصادية والاجتماعية",\n'
|
| 73 |
+
# ' "query": "دور ريادة الأعمال في التنمية الاجتماعية",\n'
|
| 74 |
+
# ' "parts": [\n'
|
| 75 |
+
# " {\n"
|
| 76 |
+
# ' "page_url": "https://example.com/page1",\n'
|
| 77 |
+
# ' "title": "Understanding Entrepreneurship in the Arab World",\n'
|
| 78 |
+
# ' "content": "Full educational content extracted from the site.",\n'
|
| 79 |
+
# ' "img_url": ["https://example.com/image1.jpg"],\n'
|
| 80 |
+
# ' "video_url": ["https://example.com/video1.mp4"],\n'
|
| 81 |
+
# ' "audio_url": ["https://example.com/audio1.mp3"],\n'
|
| 82 |
+
# ' "pdf_url": ["https://example.com/file1.pdf"],\n'
|
| 83 |
+
# ' "agent_recommendation_rank": 4.8,\n'
|
| 84 |
+
# ' "agent_recommendation_notes": "Rich Arabic content, relevant to conceptual materials."\n'
|
| 85 |
+
# " }\n"
|
| 86 |
+
# " ]\n"
|
| 87 |
+
# "}\n\n"
|
| 88 |
+
# "Make the output compatible with Python's ast library (use r1 = result.dict()['raw']; f_result = ast.literal_eval(r1)).\n"
|
| 89 |
+
# "Ensure valid JSON syntax with no unterminated strings or extra text.\n"
|
| 90 |
+
# "Output ONLY the dictionary — no thoughts, explanations, or markdown formatting."
|
| 91 |
+
# ),
|
| 92 |
+
# agent=scraping_crawlee_agent,
|
| 93 |
+
# output_json=UnitSubtopicOutputModel,
|
| 94 |
+
# )
|
| 95 |
+
# #
|
routers/scraper_route.py
CHANGED
|
@@ -3,9 +3,9 @@ import json
|
|
| 3 |
import shutil
|
| 4 |
from fastapi import APIRouter, HTTPException, UploadFile, Form, Request
|
| 5 |
from crewai import Crew, Process
|
| 6 |
-
from agents.design_phase import scraping_built_in_agent, scraping_built_in_task
|
| 7 |
from schemas import DNAMetadata, OutlineInput
|
| 8 |
-
from tools import scrape_with_bs4
|
| 9 |
|
| 10 |
router = APIRouter(prefix="/design", tags=["Design"])
|
| 11 |
|
|
@@ -193,97 +193,97 @@ async def run_training(request: Request, file: UploadFile, data: str = Form(...)
|
|
| 193 |
"json_dict": output_data,
|
| 194 |
}
|
| 195 |
|
| 196 |
-
#################################
|
| 197 |
-
# crawlee #
|
| 198 |
-
#################################
|
| 199 |
-
@router.post("/scraper_crawlee")
|
| 200 |
-
async def run_training(request: Request, file: UploadFile, data: str = Form(...)):
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
|
| 288 |
|
| 289 |
##############################
|
|
@@ -332,7 +332,9 @@ async def process_json_scrape(request: Request, file: UploadFile, data: str, mod
|
|
| 332 |
if mode == "bs4":
|
| 333 |
scraped = scrape_with_bs4(url)
|
| 334 |
else:
|
| 335 |
-
|
|
|
|
|
|
|
| 336 |
|
| 337 |
all_results.append(
|
| 338 |
{
|
|
|
|
| 3 |
import shutil
|
| 4 |
from fastapi import APIRouter, HTTPException, UploadFile, Form, Request
|
| 5 |
from crewai import Crew, Process
|
| 6 |
+
from agents.design_phase import scraping_built_in_agent, scraping_built_in_task,scraping_bs4_agent,scraping_bs4_task
|
| 7 |
from schemas import DNAMetadata, OutlineInput
|
| 8 |
+
from tools import scrape_with_bs4#,crawl_url
|
| 9 |
|
| 10 |
router = APIRouter(prefix="/design", tags=["Design"])
|
| 11 |
|
|
|
|
| 193 |
"json_dict": output_data,
|
| 194 |
}
|
| 195 |
|
| 196 |
+
# #################################
|
| 197 |
+
# # crawlee #
|
| 198 |
+
# #################################
|
| 199 |
+
# @router.post("/scraper_crawlee")
|
| 200 |
+
# async def run_training(request: Request, file: UploadFile, data: str = Form(...)):
|
| 201 |
+
# """Uploads keywords JSON + metadata JSON, runs CrewAI search, returns download link."""
|
| 202 |
+
|
| 203 |
+
# # ✅ Parse metadata JSON
|
| 204 |
+
# try:
|
| 205 |
+
# parsed_data = json.loads(data)
|
| 206 |
+
# metadata = OutlineInput(**parsed_data)
|
| 207 |
+
# except Exception as e:
|
| 208 |
+
# raise HTTPException(status_code=400, detail=f"Invalid JSON in 'data': {e}")
|
| 209 |
+
|
| 210 |
+
# # ✅ Save uploaded file temporarily
|
| 211 |
+
# save_path = f"/tmp/{file.filename}"
|
| 212 |
+
# with open(save_path, "wb") as buffer:
|
| 213 |
+
# shutil.copyfileobj(file.file, buffer)
|
| 214 |
+
|
| 215 |
+
# # ✅ Validate file extension
|
| 216 |
+
# if not save_path.lower().endswith(".json"):
|
| 217 |
+
# raise HTTPException(status_code=400, detail="File must be a JSON file")
|
| 218 |
+
|
| 219 |
+
# # ✅ Load file content
|
| 220 |
+
# try:
|
| 221 |
+
# with open(save_path, "r", encoding="utf-8") as f:
|
| 222 |
+
# urls_data = json.load(f)
|
| 223 |
+
# except json.JSONDecodeError:
|
| 224 |
+
# raise HTTPException(status_code=400, detail="Invalid JSON file content")
|
| 225 |
+
|
| 226 |
+
# # ✅ Initialize Crew
|
| 227 |
+
# crew = Crew(
|
| 228 |
+
# agents=[scraping_built_in_agent],
|
| 229 |
+
# tasks=[scraping_built_in_task],
|
| 230 |
+
# process=Process.sequential,
|
| 231 |
+
# )
|
| 232 |
+
|
| 233 |
+
# # ✅ Build static user metadata
|
| 234 |
+
# user_inputs = DNAMetadata(
|
| 235 |
+
# topic=metadata.topic,
|
| 236 |
+
# domain=metadata.domain,
|
| 237 |
+
# content_type=metadata.content_type,
|
| 238 |
+
# audience=metadata.audience,
|
| 239 |
+
# material_type=metadata.material_type,
|
| 240 |
+
# ).dict()
|
| 241 |
+
|
| 242 |
+
# all_results = []
|
| 243 |
+
|
| 244 |
+
# # ✅ Iterate through each topic unit and result link
|
| 245 |
+
# for unit in urls_data["results"]:
|
| 246 |
+
# unit_title = unit["unit_title"]
|
| 247 |
+
# subtopic_title = unit["subtopic_title"]
|
| 248 |
+
# query = unit["query"]
|
| 249 |
+
|
| 250 |
+
# for result_item in unit["results"]:
|
| 251 |
+
# url = result_item["url"]
|
| 252 |
+
# print(f"🔍 Running scrape for [{subtopic_title}] | URL: {url}")
|
| 253 |
+
|
| 254 |
+
# merged_input = {
|
| 255 |
+
# **user_inputs,
|
| 256 |
+
# "url": url,
|
| 257 |
+
# "unit_title": unit_title,
|
| 258 |
+
# "subtopic_title": subtopic_title,
|
| 259 |
+
# "query": query,
|
| 260 |
+
# }
|
| 261 |
+
|
| 262 |
+
# try:
|
| 263 |
+
# result = crew.kickoff(inputs=merged_input)
|
| 264 |
+
# all_results.append(result.dict())
|
| 265 |
+
# except Exception as e:
|
| 266 |
+
# print(f"⚠️ Error while processing '{url}': {e}")
|
| 267 |
+
|
| 268 |
+
# # ✅ Save aggregated results
|
| 269 |
+
# output_data = {"results": all_results}
|
| 270 |
+
# output_file = "/tmp/search_results.json"
|
| 271 |
+
# with open(output_file, "w", encoding="utf-8") as f:
|
| 272 |
+
# json.dump(output_data, f, ensure_ascii=False, indent=2)
|
| 273 |
+
|
| 274 |
+
# # ✅ Build download URL
|
| 275 |
+
# base_url = str(request.base_url).rstrip("/")
|
| 276 |
+
# download_link = (
|
| 277 |
+
# f"{base_url}/design/download?filename={os.path.basename(output_file)}"
|
| 278 |
+
# )
|
| 279 |
+
|
| 280 |
+
# return {
|
| 281 |
+
# "message": "Scraping process completed successfully 🚀",
|
| 282 |
+
# "total_queries": len(all_results),
|
| 283 |
+
# "download_link": download_link,
|
| 284 |
+
# "result": all_results,
|
| 285 |
+
# "json_dict": output_data,
|
| 286 |
+
# }
|
| 287 |
|
| 288 |
|
| 289 |
##############################
|
|
|
|
| 332 |
if mode == "bs4":
|
| 333 |
scraped = scrape_with_bs4(url)
|
| 334 |
else:
|
| 335 |
+
|
| 336 |
+
#scraped = await crawl_url(url)
|
| 337 |
+
break
|
| 338 |
|
| 339 |
all_results.append(
|
| 340 |
{
|
tools/__init__.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
# from .chapter_division import split_json
|
| 2 |
# from .scraper import web_scraping_tool
|
| 3 |
# from .tavily import search_engine_tool, is_recent
|
| 4 |
-
from .scraper.scraper_crawlee import WebScrapingCrawleeTool
|
| 5 |
from .scraper.scraper_bs4 import WebScrapingToolBS4
|
| 6 |
from .scraper.scraper_built_in import pdf_tool, scraping_tool
|
| 7 |
from .validate_url import URLValidatorTool
|
| 8 |
from .scraper.no_agent.pdf_extractor import extract_pdf_content
|
| 9 |
from .scraper.no_agent.bs4_scraper import scrape_with_bs4
|
| 10 |
-
from .scraper.no_agent.crawlee_scraper import crawl_url
|
|
|
|
| 1 |
# from .chapter_division import split_json
|
| 2 |
# from .scraper import web_scraping_tool
|
| 3 |
# from .tavily import search_engine_tool, is_recent
|
| 4 |
+
# from .scraper.scraper_crawlee import WebScrapingCrawleeTool
|
| 5 |
from .scraper.scraper_bs4 import WebScrapingToolBS4
|
| 6 |
from .scraper.scraper_built_in import pdf_tool, scraping_tool
|
| 7 |
from .validate_url import URLValidatorTool
|
| 8 |
from .scraper.no_agent.pdf_extractor import extract_pdf_content
|
| 9 |
from .scraper.no_agent.bs4_scraper import scrape_with_bs4
|
| 10 |
+
# from .scraper.no_agent.crawlee_scraper import crawl_url
|
tools/scraper/no_agent/crawlee_scraper.py
CHANGED
|
@@ -1,31 +1,31 @@
|
|
| 1 |
-
from crawlee import HttpCrawler, run_crawler
|
| 2 |
-
from tools import extract_pdf_content
|
| 3 |
|
| 4 |
|
| 5 |
-
class SimpleCrawler(HttpCrawler):
|
| 6 |
-
|
| 7 |
-
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
|
| 18 |
-
|
| 19 |
|
| 20 |
|
| 21 |
-
async def crawl_url(url: str):
|
| 22 |
-
|
| 23 |
-
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
|
|
|
| 1 |
+
# from crawlee import HttpCrawler, run_crawler
|
| 2 |
+
# from tools import extract_pdf_content
|
| 3 |
|
| 4 |
|
| 5 |
+
# class SimpleCrawler(HttpCrawler):
|
| 6 |
+
# async def handle_page(self, context):
|
| 7 |
+
# url = context.request.url
|
| 8 |
|
| 9 |
+
# # PDF case
|
| 10 |
+
# if url.lower().endswith(".pdf"):
|
| 11 |
+
# content = extract_pdf_content(url)
|
| 12 |
+
# return {"url": url, "content": content}
|
| 13 |
|
| 14 |
+
# # HTML case
|
| 15 |
+
# html = await context.body
|
| 16 |
+
# content = html.decode("utf-8", errors="ignore")
|
| 17 |
|
| 18 |
+
# return {"url": url, "content": content}
|
| 19 |
|
| 20 |
|
| 21 |
+
# async def crawl_url(url: str):
|
| 22 |
+
# crawler = SimpleCrawler()
|
| 23 |
+
# results = []
|
| 24 |
|
| 25 |
+
# @crawler.router.default_handler
|
| 26 |
+
# async def handle_page(ctx):
|
| 27 |
+
# res = await crawler.handle_page(ctx)
|
| 28 |
+
# results.append(res)
|
| 29 |
|
| 30 |
+
# await run_crawler(crawler, start_urls=[url])
|
| 31 |
+
# return results[0]
|
tools/scraper/scraper_crawlee.py
CHANGED
|
@@ -1,90 +1,90 @@
|
|
| 1 |
-
from crewai.tools import BaseTool
|
| 2 |
-
from typing import Literal
|
| 3 |
|
| 4 |
-
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
|
| 5 |
-
from crawlee.http_clients import HttpxHttpClient
|
| 6 |
-
import fitz
|
| 7 |
-
import asyncio
|
| 8 |
-
import tempfile
|
| 9 |
-
import os
|
| 10 |
-
from urllib.parse import urlparse
|
| 11 |
|
| 12 |
|
| 13 |
-
class WebScrapingCrawleeTool(BaseTool):
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
|
| 54 |
-
|
| 55 |
-
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
|
| 89 |
-
|
| 90 |
-
|
|
|
|
| 1 |
+
# from crewai.tools import BaseTool
|
| 2 |
+
# from typing import Literal
|
| 3 |
|
| 4 |
+
# from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
|
| 5 |
+
# from crawlee.http_clients import HttpxHttpClient
|
| 6 |
+
# import fitz
|
| 7 |
+
# import asyncio
|
| 8 |
+
# import tempfile
|
| 9 |
+
# import os
|
| 10 |
+
# from urllib.parse import urlparse
|
| 11 |
|
| 12 |
|
| 13 |
+
# class WebScrapingCrawleeTool(BaseTool):
|
| 14 |
+
# name: Literal["web_scraping_crawlee_tool"]
|
| 15 |
+
# description: str = (
|
| 16 |
+
# "Scrapes Arabic/English content from webpages using Crawlee PlaywrightCrawler "
|
| 17 |
+
# "or extracts readable text from PDF files. Returns a dictionary: "
|
| 18 |
+
# "page_url, title, content, img_url, video_url, audio_url, pdf_url."
|
| 19 |
+
# )
|
| 20 |
|
| 21 |
+
# async def scrape_with_crawlee(self, target_url):
|
| 22 |
+
# crawler = PlaywrightCrawler()
|
| 23 |
+
# extracted_data = {}
|
| 24 |
|
| 25 |
+
# @crawler.router.default_handler
|
| 26 |
+
# async def default_handler(context: PlaywrightCrawlingContext) -> None:
|
| 27 |
+
# page = context.page
|
| 28 |
+
# title = await page.title()
|
| 29 |
+
# content = await page.content()
|
| 30 |
+
# images = await page.eval_on_selector_all(
|
| 31 |
+
# "img", "els => els.map(e => e.src)"
|
| 32 |
+
# )
|
| 33 |
+
# videos = await page.eval_on_selector_all(
|
| 34 |
+
# "video", "els => els.map(e => e.src)"
|
| 35 |
+
# )
|
| 36 |
+
# audios = await page.eval_on_selector_all(
|
| 37 |
+
# "audio", "els => els.map(e => e.src)"
|
| 38 |
+
# )
|
| 39 |
+
# pdfs = await page.eval_on_selector_all(
|
| 40 |
+
# "a[href$='.pdf']", "els => els.map(e => e.href)"
|
| 41 |
+
# )
|
| 42 |
|
| 43 |
+
# extracted_data.update(
|
| 44 |
+
# {
|
| 45 |
+
# "title": title,
|
| 46 |
+
# "content": content,
|
| 47 |
+
# "img_url": images,
|
| 48 |
+
# "video_url": videos,
|
| 49 |
+
# "audio_url": audios,
|
| 50 |
+
# "pdf_url": pdfs,
|
| 51 |
+
# }
|
| 52 |
+
# )
|
| 53 |
|
| 54 |
+
# await crawler.run([target_url])
|
| 55 |
+
# return extracted_data
|
| 56 |
|
| 57 |
+
# def extract_pdf_text(self, pdf_url):
|
| 58 |
+
# http = HttpxHttpClient()
|
| 59 |
+
# response = http.get(pdf_url)
|
| 60 |
+
# with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
|
| 61 |
+
# tmp.write(response.content)
|
| 62 |
+
# tmp_path = tmp.name
|
| 63 |
+
# text = ""
|
| 64 |
+
# with fitz.open(tmp_path) as doc:
|
| 65 |
+
# for page in doc:
|
| 66 |
+
# text += page.get_text("text")
|
| 67 |
+
# os.remove(tmp_path)
|
| 68 |
+
# return text.strip()
|
| 69 |
|
| 70 |
+
# def _run(self, url: str) -> dict:
|
| 71 |
+
# try:
|
| 72 |
+
# parsed = urlparse(url)
|
| 73 |
+
# if parsed.path.lower().endswith(".pdf"):
|
| 74 |
+
# content = self.extract_pdf_text(url)
|
| 75 |
+
# return {
|
| 76 |
+
# "page_url": url,
|
| 77 |
+
# "title": os.path.basename(parsed.path),
|
| 78 |
+
# "content": content,
|
| 79 |
+
# "img_url": [],
|
| 80 |
+
# "video_url": [],
|
| 81 |
+
# "audio_url": [],
|
| 82 |
+
# "pdf_url": [url],
|
| 83 |
+
# }
|
| 84 |
+
# scraped_data = asyncio.run(self.scrape_with_crawlee(url))
|
| 85 |
+
# return {"page_url": url, **scraped_data}
|
| 86 |
+
# except Exception as e:
|
| 87 |
+
# return {"error": str(e), "page_url": url}
|
| 88 |
|
| 89 |
+
# async def _arun(self, url: str) -> dict:
|
| 90 |
+
# return self._run(url)
|