ziadsameh32 committed on
Commit
ff2fbf2
·
1 Parent(s): 58ee4d2
Dockerfile CHANGED
@@ -1,45 +1,18 @@
1
- # ===== Base image =====
2
- FROM python:3.10-slim
3
 
4
- # ===== System dependencies =====
5
- RUN apt-get update && apt-get install -y \
6
- wget \
7
- curl \
8
- unzip \
9
- gnupg \
10
- libglib2.0-0 \
11
- libnss3 \
12
- libgdk-pixbuf-xlib-2.0-0 \
13
- libgtk-3-0 \
14
- libxcomposite1 \
15
- libxdamage1 \
16
- libxrandr2 \
17
- libxss1 \
18
- libasound2 \
19
- libxshmfence1 \
20
- libgbm1 \
21
- libpango-1.0-0 \
22
- libpangocairo-1.0-0 \
23
- libcairo2 \
24
- fonts-liberation \
25
- libappindicator3-1 \
26
- xdg-utils \
27
- && rm -rf /var/lib/apt/lists/*
28
-
29
- # ===== Work Directory =====
30
  WORKDIR /app
31
 
32
- # ===== Install Python requirements =====
33
- COPY requirements.txt .
34
- RUN pip install --no-cache-dir -r requirements.txt
35
-
36
- # ===== Install Playwright Browsers =====
37
- RUN playwright install --with-deps chromium
38
-
39
-
40
  # انسخ ملفات المشروع
41
  COPY . /app
42
 
 
 
 
 
 
 
43
 
44
  # عرّف البورت اللي البرنامج هيشتغل عليه
45
  EXPOSE 7860
 
1
+ # استخدم صورة أساسية خفيفة من Python
2
+ FROM python:3.10
3
 
4
+ # اضبط مجلد العمل داخل الحاوية
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  WORKDIR /app
6
 
 
 
 
 
 
 
 
 
7
  # انسخ ملفات المشروع
8
  COPY . /app
9
 
10
+ # انسخ ملف requirements لو عندك
11
+ COPY requirements.txt .
12
+
13
+ # نزّل المتطلبات
14
+ RUN pip install --no-cache-dir -r requirements.txt
15
+
16
 
17
  # عرّف البورت اللي البرنامج هيشتغل عليه
18
  EXPOSE 7860
agents/design_phase/__init__.py CHANGED
@@ -2,4 +2,4 @@ from .keywoard_researcher import keyword_researcher_agent, keyword_researcher_ta
2
  from .source_finder import search_engine_agent, search_engine_task
3
  from .scraper.extractor_bulit_in import scraping_built_in_agent, scraping_built_in_task
4
  from .scraper.extractor_bs4 import scraping_bs4_agent, scraping_bs4_task
5
- from .scraper.extractor_crawlee import scraping_crawlee_agent, scraping_crawlee_task
 
2
  from .source_finder import search_engine_agent, search_engine_task
3
  from .scraper.extractor_bulit_in import scraping_built_in_agent, scraping_built_in_task
4
  from .scraper.extractor_bs4 import scraping_bs4_agent, scraping_bs4_task
5
+ # from .scraper.extractor_crawlee import scraping_crawlee_agent, scraping_crawlee_task
agents/design_phase/scraper/extractor_crawlee.py CHANGED
@@ -1,94 +1,95 @@
1
- from crewai import Agent, Task
2
- from modules import llm_g
3
- from tools import pdf_tool, scraping_tool
4
- from tools import WebScrapingCrawleeTool
5
- from schemas import UnitSubtopicOutputModel
6
 
7
 
8
- web_scraper = WebScrapingCrawleeTool()
9
 
10
- scraping_crawlee_agent = Agent(
11
- role="Educational Content Scraping & Knowledge Extraction Agent",
12
- goal="\n".join(
13
- [
14
- "Collect and extract complete, structured, and educationally valuable content "
15
- "from Arabic and English websites and PDFs related to the course topic: {topic}.",
16
- "Focus on sources that match the course domain ({domain}), content type ({content_type}), "
17
- "and audience ({audience}).",
18
- "Prioritize materials that can serve as strong foundations for creating {material_type} "
19
- "learning materials (conceptual, structural, procedural, and real-world).",
20
- "Extract full text including all sections, examples, and details, ensuring high accuracy for Arabic text.",
21
- "Assess each source’s credibility and educational relevance, ranking them by usefulness for course design.",
22
- "Provide concise expert notes and recommendations that will assist curriculum developers and instructional designers "
23
- "in selecting the best materials for building a complete learning unit.",
24
- ]
25
- ),
26
- backstory="\n".join(
27
- [
28
- "You are a specialized educational data researcher trained to explore, extract, and organize academic and professional content.",
29
- "You excel at discovering high-quality Arabic and English resources that align with specific course development objectives.",
30
- "Your mission is to help course designers collect trustworthy, pedagogically sound materials that will form the backbone of educational units.",
31
- "You understand how to evaluate the quality, relevance, and credibility of both web pages and PDFs.",
32
- "You are particularly skilled at preserving Arabic text integrity and extracting complete structured information for learning materials.",
33
- ]
34
- ),
35
- llm=llm_g,
36
- tools=[web_scraper],
37
- verbose=True,
38
- )
39
 
40
 
41
- scraping_crawlee_task = Task(
42
- description="\n".join(
43
- [
44
- "Your task is to extract and organize full educational content from the following source:",
45
- "",
46
- "URL: {url}",
47
- "Unit Title: {unit_title}",
48
- "Subtopic Title: {subtopic_title}",
49
- "Query Used: {query}",
50
- "",
51
- "This link is part of the course topic '{topic}' under the domain '{domain}'.",
52
- "The extracted content should help create educational materials for the audience '{audience}', focusing on '{content_type}' learning goals.",
53
- "",
54
- "For the given URL:",
55
- " - Extract the full title, structured text, and any available media (images, videos, audios, PDFs).",
56
- " - Maintain the Arabic text structure and readability.",
57
- " - Evaluate its reliability and educational value in relation to {material_type} material categories.",
58
- " - Assign an agent recommendation rank (0–5) based on credibility and relevance.",
59
- " - Provide short expert notes justifying the ranking and explaining how the content can contribute to the course design.",
60
- "",
61
- "Ensure no important content, examples, or explanations are omitted from extraction.",
62
- ]
63
- ),
64
- expected_output=(
65
- "Return ONLY a valid Python dictionary.\n"
66
- "- Do not include explanations, markdown, or code fences.\n"
67
- "- The dictionary must be UTF-8 safe and directly usable in Python with ast.literal_eval.\n"
68
- "- Keys must be wrapped in double quotes.\n\n"
69
- "Format example:\n"
70
- "{\n"
71
- ' "unit_title": "من الفكرة إلى نموذج العمل: بناء الأساس الريادي",\n'
72
- ' "subtopic_title": "مفهوم ريادة الأعمال وأهميتها الاقتصادية والاجتماعية",\n'
73
- ' "query": "دور ريادة الأعمال في التنمية الاجتماعية",\n'
74
- ' "parts": [\n'
75
- " {\n"
76
- ' "page_url": "https://example.com/page1",\n'
77
- ' "title": "Understanding Entrepreneurship in the Arab World",\n'
78
- ' "content": "Full educational content extracted from the site.",\n'
79
- ' "img_url": ["https://example.com/image1.jpg"],\n'
80
- ' "video_url": ["https://example.com/video1.mp4"],\n'
81
- ' "audio_url": ["https://example.com/audio1.mp3"],\n'
82
- ' "pdf_url": ["https://example.com/file1.pdf"],\n'
83
- ' "agent_recommendation_rank": 4.8,\n'
84
- ' "agent_recommendation_notes": "Rich Arabic content, relevant to conceptual materials."\n'
85
- " }\n"
86
- " ]\n"
87
- "}\n\n"
88
- "Make the output compatible with Python's ast library (use r1 = result.dict()['raw']; f_result = ast.literal_eval(r1)).\n"
89
- "Ensure valid JSON syntax with no unterminated strings or extra text.\n"
90
- "Output ONLY the dictionary — no thoughts, explanations, or markdown formatting."
91
- ),
92
- agent=scraping_crawlee_agent,
93
- output_json=UnitSubtopicOutputModel,
94
- )
 
 
1
+ # from crewai import Agent, Task
2
+ # from modules import llm_g
3
+ # from tools import pdf_tool, scraping_tool
4
+ # from tools import WebScrapingCrawleeTool
5
+ # from schemas import UnitSubtopicOutputModel
6
 
7
 
8
+ # web_scraper = WebScrapingCrawleeTool()
9
 
10
+ # scraping_crawlee_agent = Agent(
11
+ # role="Educational Content Scraping & Knowledge Extraction Agent",
12
+ # goal="\n".join(
13
+ # [
14
+ # "Collect and extract complete, structured, and educationally valuable content "
15
+ # "from Arabic and English websites and PDFs related to the course topic: {topic}.",
16
+ # "Focus on sources that match the course domain ({domain}), content type ({content_type}), "
17
+ # "and audience ({audience}).",
18
+ # "Prioritize materials that can serve as strong foundations for creating {material_type} "
19
+ # "learning materials (conceptual, structural, procedural, and real-world).",
20
+ # "Extract full text including all sections, examples, and details, ensuring high accuracy for Arabic text.",
21
+ # "Assess each source’s credibility and educational relevance, ranking them by usefulness for course design.",
22
+ # "Provide concise expert notes and recommendations that will assist curriculum developers and instructional designers "
23
+ # "in selecting the best materials for building a complete learning unit.",
24
+ # ]
25
+ # ),
26
+ # backstory="\n".join(
27
+ # [
28
+ # "You are a specialized educational data researcher trained to explore, extract, and organize academic and professional content.",
29
+ # "You excel at discovering high-quality Arabic and English resources that align with specific course development objectives.",
30
+ # "Your mission is to help course designers collect trustworthy, pedagogically sound materials that will form the backbone of educational units.",
31
+ # "You understand how to evaluate the quality, relevance, and credibility of both web pages and PDFs.",
32
+ # "You are particularly skilled at preserving Arabic text integrity and extracting complete structured information for learning materials.",
33
+ # ]
34
+ # ),
35
+ # llm=llm_g,
36
+ # tools=[web_scraper],
37
+ # verbose=True,
38
+ # )
39
 
40
 
41
+ # scraping_crawlee_task = Task(
42
+ # description="\n".join(
43
+ # [
44
+ # "Your task is to extract and organize full educational content from the following source:",
45
+ # "",
46
+ # "URL: {url}",
47
+ # "Unit Title: {unit_title}",
48
+ # "Subtopic Title: {subtopic_title}",
49
+ # "Query Used: {query}",
50
+ # "",
51
+ # "This link is part of the course topic '{topic}' under the domain '{domain}'.",
52
+ # "The extracted content should help create educational materials for the audience '{audience}', focusing on '{content_type}' learning goals.",
53
+ # "",
54
+ # "For the given URL:",
55
+ # " - Extract the full title, structured text, and any available media (images, videos, audios, PDFs).",
56
+ # " - Maintain the Arabic text structure and readability.",
57
+ # " - Evaluate its reliability and educational value in relation to {material_type} material categories.",
58
+ # " - Assign an agent recommendation rank (0–5) based on credibility and relevance.",
59
+ # " - Provide short expert notes justifying the ranking and explaining how the content can contribute to the course design.",
60
+ # "",
61
+ # "Ensure no important content, examples, or explanations are omitted from extraction.",
62
+ # ]
63
+ # ),
64
+ # expected_output=(
65
+ # "Return ONLY a valid Python dictionary.\n"
66
+ # "- Do not include explanations, markdown, or code fences.\n"
67
+ # "- The dictionary must be UTF-8 safe and directly usable in Python with ast.literal_eval.\n"
68
+ # "- Keys must be wrapped in double quotes.\n\n"
69
+ # "Format example:\n"
70
+ # "{\n"
71
+ # ' "unit_title": "من الفكرة إلى نموذج العمل: بناء الأساس الريادي",\n'
72
+ # ' "subtopic_title": "مفهوم ريادة الأعمال وأهميتها الاقتصادية والاجتماعية",\n'
73
+ # ' "query": "دور ريادة الأعمال في التنمية الاجتماعية",\n'
74
+ # ' "parts": [\n'
75
+ # " {\n"
76
+ # ' "page_url": "https://example.com/page1",\n'
77
+ # ' "title": "Understanding Entrepreneurship in the Arab World",\n'
78
+ # ' "content": "Full educational content extracted from the site.",\n'
79
+ # ' "img_url": ["https://example.com/image1.jpg"],\n'
80
+ # ' "video_url": ["https://example.com/video1.mp4"],\n'
81
+ # ' "audio_url": ["https://example.com/audio1.mp3"],\n'
82
+ # ' "pdf_url": ["https://example.com/file1.pdf"],\n'
83
+ # ' "agent_recommendation_rank": 4.8,\n'
84
+ # ' "agent_recommendation_notes": "Rich Arabic content, relevant to conceptual materials."\n'
85
+ # " }\n"
86
+ # " ]\n"
87
+ # "}\n\n"
88
+ # "Make the output compatible with Python's ast library (use r1 = result.dict()['raw']; f_result = ast.literal_eval(r1)).\n"
89
+ # "Ensure valid JSON syntax with no unterminated strings or extra text.\n"
90
+ # "Output ONLY the dictionary — no thoughts, explanations, or markdown formatting."
91
+ # ),
92
+ # agent=scraping_crawlee_agent,
93
+ # output_json=UnitSubtopicOutputModel,
94
+ # )
95
+ # #
routers/scraper_route.py CHANGED
@@ -3,9 +3,9 @@ import json
3
  import shutil
4
  from fastapi import APIRouter, HTTPException, UploadFile, Form, Request
5
  from crewai import Crew, Process
6
- from agents.design_phase import scraping_built_in_agent, scraping_built_in_task
7
  from schemas import DNAMetadata, OutlineInput
8
- from tools import scrape_with_bs4,crawl_url
9
 
10
  router = APIRouter(prefix="/design", tags=["Design"])
11
 
@@ -193,97 +193,97 @@ async def run_training(request: Request, file: UploadFile, data: str = Form(...)
193
  "json_dict": output_data,
194
  }
195
 
196
- #################################
197
- # crawlee #
198
- #################################
199
- @router.post("/scraper_crawlee")
200
- async def run_training(request: Request, file: UploadFile, data: str = Form(...)):
201
- """Uploads keywords JSON + metadata JSON, runs CrewAI search, returns download link."""
202
-
203
- # ✅ Parse metadata JSON
204
- try:
205
- parsed_data = json.loads(data)
206
- metadata = OutlineInput(**parsed_data)
207
- except Exception as e:
208
- raise HTTPException(status_code=400, detail=f"Invalid JSON in 'data': {e}")
209
-
210
- # ✅ Save uploaded file temporarily
211
- save_path = f"/tmp/{file.filename}"
212
- with open(save_path, "wb") as buffer:
213
- shutil.copyfileobj(file.file, buffer)
214
-
215
- # ✅ Validate file extension
216
- if not save_path.lower().endswith(".json"):
217
- raise HTTPException(status_code=400, detail="File must be a JSON file")
218
-
219
- # ✅ Load file content
220
- try:
221
- with open(save_path, "r", encoding="utf-8") as f:
222
- urls_data = json.load(f)
223
- except json.JSONDecodeError:
224
- raise HTTPException(status_code=400, detail="Invalid JSON file content")
225
-
226
- # ✅ Initialize Crew
227
- crew = Crew(
228
- agents=[scraping_built_in_agent],
229
- tasks=[scraping_built_in_task],
230
- process=Process.sequential,
231
- )
232
-
233
- # ✅ Build static user metadata
234
- user_inputs = DNAMetadata(
235
- topic=metadata.topic,
236
- domain=metadata.domain,
237
- content_type=metadata.content_type,
238
- audience=metadata.audience,
239
- material_type=metadata.material_type,
240
- ).dict()
241
-
242
- all_results = []
243
-
244
- # ✅ Iterate through each topic unit and result link
245
- for unit in urls_data["results"]:
246
- unit_title = unit["unit_title"]
247
- subtopic_title = unit["subtopic_title"]
248
- query = unit["query"]
249
-
250
- for result_item in unit["results"]:
251
- url = result_item["url"]
252
- print(f"🔍 Running scrape for [{subtopic_title}] | URL: {url}")
253
-
254
- merged_input = {
255
- **user_inputs,
256
- "url": url,
257
- "unit_title": unit_title,
258
- "subtopic_title": subtopic_title,
259
- "query": query,
260
- }
261
-
262
- try:
263
- result = crew.kickoff(inputs=merged_input)
264
- all_results.append(result.dict())
265
- except Exception as e:
266
- print(f"⚠️ Error while processing '{url}': {e}")
267
-
268
- # ✅ Save aggregated results
269
- output_data = {"results": all_results}
270
- output_file = "/tmp/search_results.json"
271
- with open(output_file, "w", encoding="utf-8") as f:
272
- json.dump(output_data, f, ensure_ascii=False, indent=2)
273
-
274
- # ✅ Build download URL
275
- base_url = str(request.base_url).rstrip("/")
276
- download_link = (
277
- f"{base_url}/design/download?filename={os.path.basename(output_file)}"
278
- )
279
-
280
- return {
281
- "message": "Scraping process completed successfully 🚀",
282
- "total_queries": len(all_results),
283
- "download_link": download_link,
284
- "result": all_results,
285
- "json_dict": output_data,
286
- }
287
 
288
 
289
  ##############################
@@ -332,7 +332,9 @@ async def process_json_scrape(request: Request, file: UploadFile, data: str, mod
332
  if mode == "bs4":
333
  scraped = scrape_with_bs4(url)
334
  else:
335
- scraped = await crawl_url(url)
 
 
336
 
337
  all_results.append(
338
  {
 
3
  import shutil
4
  from fastapi import APIRouter, HTTPException, UploadFile, Form, Request
5
  from crewai import Crew, Process
6
+ from agents.design_phase import scraping_built_in_agent, scraping_built_in_task,scraping_bs4_agent,scraping_bs4_task
7
  from schemas import DNAMetadata, OutlineInput
8
+ from tools import scrape_with_bs4#,crawl_url
9
 
10
  router = APIRouter(prefix="/design", tags=["Design"])
11
 
 
193
  "json_dict": output_data,
194
  }
195
 
196
+ # #################################
197
+ # # crawlee #
198
+ # #################################
199
+ # @router.post("/scraper_crawlee")
200
+ # async def run_training(request: Request, file: UploadFile, data: str = Form(...)):
201
+ # """Uploads keywords JSON + metadata JSON, runs CrewAI search, returns download link."""
202
+
203
+ # # ✅ Parse metadata JSON
204
+ # try:
205
+ # parsed_data = json.loads(data)
206
+ # metadata = OutlineInput(**parsed_data)
207
+ # except Exception as e:
208
+ # raise HTTPException(status_code=400, detail=f"Invalid JSON in 'data': {e}")
209
+
210
+ # # ✅ Save uploaded file temporarily
211
+ # save_path = f"/tmp/{file.filename}"
212
+ # with open(save_path, "wb") as buffer:
213
+ # shutil.copyfileobj(file.file, buffer)
214
+
215
+ # # ✅ Validate file extension
216
+ # if not save_path.lower().endswith(".json"):
217
+ # raise HTTPException(status_code=400, detail="File must be a JSON file")
218
+
219
+ # # ✅ Load file content
220
+ # try:
221
+ # with open(save_path, "r", encoding="utf-8") as f:
222
+ # urls_data = json.load(f)
223
+ # except json.JSONDecodeError:
224
+ # raise HTTPException(status_code=400, detail="Invalid JSON file content")
225
+
226
+ # # ✅ Initialize Crew
227
+ # crew = Crew(
228
+ # agents=[scraping_built_in_agent],
229
+ # tasks=[scraping_built_in_task],
230
+ # process=Process.sequential,
231
+ # )
232
+
233
+ # # ✅ Build static user metadata
234
+ # user_inputs = DNAMetadata(
235
+ # topic=metadata.topic,
236
+ # domain=metadata.domain,
237
+ # content_type=metadata.content_type,
238
+ # audience=metadata.audience,
239
+ # material_type=metadata.material_type,
240
+ # ).dict()
241
+
242
+ # all_results = []
243
+
244
+ # # ✅ Iterate through each topic unit and result link
245
+ # for unit in urls_data["results"]:
246
+ # unit_title = unit["unit_title"]
247
+ # subtopic_title = unit["subtopic_title"]
248
+ # query = unit["query"]
249
+
250
+ # for result_item in unit["results"]:
251
+ # url = result_item["url"]
252
+ # print(f"🔍 Running scrape for [{subtopic_title}] | URL: {url}")
253
+
254
+ # merged_input = {
255
+ # **user_inputs,
256
+ # "url": url,
257
+ # "unit_title": unit_title,
258
+ # "subtopic_title": subtopic_title,
259
+ # "query": query,
260
+ # }
261
+
262
+ # try:
263
+ # result = crew.kickoff(inputs=merged_input)
264
+ # all_results.append(result.dict())
265
+ # except Exception as e:
266
+ # print(f"⚠️ Error while processing '{url}': {e}")
267
+
268
+ # # ✅ Save aggregated results
269
+ # output_data = {"results": all_results}
270
+ # output_file = "/tmp/search_results.json"
271
+ # with open(output_file, "w", encoding="utf-8") as f:
272
+ # json.dump(output_data, f, ensure_ascii=False, indent=2)
273
+
274
+ # # ✅ Build download URL
275
+ # base_url = str(request.base_url).rstrip("/")
276
+ # download_link = (
277
+ # f"{base_url}/design/download?filename={os.path.basename(output_file)}"
278
+ # )
279
+
280
+ # return {
281
+ # "message": "Scraping process completed successfully 🚀",
282
+ # "total_queries": len(all_results),
283
+ # "download_link": download_link,
284
+ # "result": all_results,
285
+ # "json_dict": output_data,
286
+ # }
287
 
288
 
289
  ##############################
 
332
  if mode == "bs4":
333
  scraped = scrape_with_bs4(url)
334
  else:
335
+
336
+ #scraped = await crawl_url(url)
337
+ break
338
 
339
  all_results.append(
340
  {
tools/__init__.py CHANGED
@@ -1,10 +1,10 @@
1
  # from .chapter_division import split_json
2
  # from .scraper import web_scraping_tool
3
  # from .tavily import search_engine_tool, is_recent
4
- from .scraper.scraper_crawlee import WebScrapingCrawleeTool
5
  from .scraper.scraper_bs4 import WebScrapingToolBS4
6
  from .scraper.scraper_built_in import pdf_tool, scraping_tool
7
  from .validate_url import URLValidatorTool
8
  from .scraper.no_agent.pdf_extractor import extract_pdf_content
9
  from .scraper.no_agent.bs4_scraper import scrape_with_bs4
10
- from .scraper.no_agent.crawlee_scraper import crawl_url
 
1
  # from .chapter_division import split_json
2
  # from .scraper import web_scraping_tool
3
  # from .tavily import search_engine_tool, is_recent
4
+ # from .scraper.scraper_crawlee import WebScrapingCrawleeTool
5
  from .scraper.scraper_bs4 import WebScrapingToolBS4
6
  from .scraper.scraper_built_in import pdf_tool, scraping_tool
7
  from .validate_url import URLValidatorTool
8
  from .scraper.no_agent.pdf_extractor import extract_pdf_content
9
  from .scraper.no_agent.bs4_scraper import scrape_with_bs4
10
+ # from .scraper.no_agent.crawlee_scraper import crawl_url
tools/scraper/no_agent/crawlee_scraper.py CHANGED
@@ -1,31 +1,31 @@
1
- from crawlee import HttpCrawler, run_crawler
2
- from tools import extract_pdf_content
3
 
4
 
5
- class SimpleCrawler(HttpCrawler):
6
- async def handle_page(self, context):
7
- url = context.request.url
8
 
9
- # PDF case
10
- if url.lower().endswith(".pdf"):
11
- content = extract_pdf_content(url)
12
- return {"url": url, "content": content}
13
 
14
- # HTML case
15
- html = await context.body
16
- content = html.decode("utf-8", errors="ignore")
17
 
18
- return {"url": url, "content": content}
19
 
20
 
21
- async def crawl_url(url: str):
22
- crawler = SimpleCrawler()
23
- results = []
24
 
25
- @crawler.router.default_handler
26
- async def handle_page(ctx):
27
- res = await crawler.handle_page(ctx)
28
- results.append(res)
29
 
30
- await run_crawler(crawler, start_urls=[url])
31
- return results[0]
 
1
+ # from crawlee import HttpCrawler, run_crawler
2
+ # from tools import extract_pdf_content
3
 
4
 
5
+ # class SimpleCrawler(HttpCrawler):
6
+ # async def handle_page(self, context):
7
+ # url = context.request.url
8
 
9
+ # # PDF case
10
+ # if url.lower().endswith(".pdf"):
11
+ # content = extract_pdf_content(url)
12
+ # return {"url": url, "content": content}
13
 
14
+ # # HTML case
15
+ # html = await context.body
16
+ # content = html.decode("utf-8", errors="ignore")
17
 
18
+ # return {"url": url, "content": content}
19
 
20
 
21
+ # async def crawl_url(url: str):
22
+ # crawler = SimpleCrawler()
23
+ # results = []
24
 
25
+ # @crawler.router.default_handler
26
+ # async def handle_page(ctx):
27
+ # res = await crawler.handle_page(ctx)
28
+ # results.append(res)
29
 
30
+ # await run_crawler(crawler, start_urls=[url])
31
+ # return results[0]
tools/scraper/scraper_crawlee.py CHANGED
@@ -1,90 +1,90 @@
1
- from crewai.tools import BaseTool
2
- from typing import Literal
3
 
4
- from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
5
- from crawlee.http_clients import HttpxHttpClient
6
- import fitz
7
- import asyncio
8
- import tempfile
9
- import os
10
- from urllib.parse import urlparse
11
 
12
 
13
- class WebScrapingCrawleeTool(BaseTool):
14
- name: Literal["web_scraping_crawlee_tool"]
15
- description: str = (
16
- "Scrapes Arabic/English content from webpages using Crawlee PlaywrightCrawler "
17
- "or extracts readable text from PDF files. Returns a dictionary: "
18
- "page_url, title, content, img_url, video_url, audio_url, pdf_url."
19
- )
20
 
21
- async def scrape_with_crawlee(self, target_url):
22
- crawler = PlaywrightCrawler()
23
- extracted_data = {}
24
 
25
- @crawler.router.default_handler
26
- async def default_handler(context: PlaywrightCrawlingContext) -> None:
27
- page = context.page
28
- title = await page.title()
29
- content = await page.content()
30
- images = await page.eval_on_selector_all(
31
- "img", "els => els.map(e => e.src)"
32
- )
33
- videos = await page.eval_on_selector_all(
34
- "video", "els => els.map(e => e.src)"
35
- )
36
- audios = await page.eval_on_selector_all(
37
- "audio", "els => els.map(e => e.src)"
38
- )
39
- pdfs = await page.eval_on_selector_all(
40
- "a[href$='.pdf']", "els => els.map(e => e.href)"
41
- )
42
 
43
- extracted_data.update(
44
- {
45
- "title": title,
46
- "content": content,
47
- "img_url": images,
48
- "video_url": videos,
49
- "audio_url": audios,
50
- "pdf_url": pdfs,
51
- }
52
- )
53
 
54
- await crawler.run([target_url])
55
- return extracted_data
56
 
57
- def extract_pdf_text(self, pdf_url):
58
- http = HttpxHttpClient()
59
- response = http.get(pdf_url)
60
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
61
- tmp.write(response.content)
62
- tmp_path = tmp.name
63
- text = ""
64
- with fitz.open(tmp_path) as doc:
65
- for page in doc:
66
- text += page.get_text("text")
67
- os.remove(tmp_path)
68
- return text.strip()
69
 
70
- def _run(self, url: str) -> dict:
71
- try:
72
- parsed = urlparse(url)
73
- if parsed.path.lower().endswith(".pdf"):
74
- content = self.extract_pdf_text(url)
75
- return {
76
- "page_url": url,
77
- "title": os.path.basename(parsed.path),
78
- "content": content,
79
- "img_url": [],
80
- "video_url": [],
81
- "audio_url": [],
82
- "pdf_url": [url],
83
- }
84
- scraped_data = asyncio.run(self.scrape_with_crawlee(url))
85
- return {"page_url": url, **scraped_data}
86
- except Exception as e:
87
- return {"error": str(e), "page_url": url}
88
 
89
- async def _arun(self, url: str) -> dict:
90
- return self._run(url)
 
1
+ # from crewai.tools import BaseTool
2
+ # from typing import Literal
3
 
4
+ # from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
5
+ # from crawlee.http_clients import HttpxHttpClient
6
+ # import fitz
7
+ # import asyncio
8
+ # import tempfile
9
+ # import os
10
+ # from urllib.parse import urlparse
11
 
12
 
13
+ # class WebScrapingCrawleeTool(BaseTool):
14
+ # name: Literal["web_scraping_crawlee_tool"]
15
+ # description: str = (
16
+ # "Scrapes Arabic/English content from webpages using Crawlee PlaywrightCrawler "
17
+ # "or extracts readable text from PDF files. Returns a dictionary: "
18
+ # "page_url, title, content, img_url, video_url, audio_url, pdf_url."
19
+ # )
20
 
21
+ # async def scrape_with_crawlee(self, target_url):
22
+ # crawler = PlaywrightCrawler()
23
+ # extracted_data = {}
24
 
25
+ # @crawler.router.default_handler
26
+ # async def default_handler(context: PlaywrightCrawlingContext) -> None:
27
+ # page = context.page
28
+ # title = await page.title()
29
+ # content = await page.content()
30
+ # images = await page.eval_on_selector_all(
31
+ # "img", "els => els.map(e => e.src)"
32
+ # )
33
+ # videos = await page.eval_on_selector_all(
34
+ # "video", "els => els.map(e => e.src)"
35
+ # )
36
+ # audios = await page.eval_on_selector_all(
37
+ # "audio", "els => els.map(e => e.src)"
38
+ # )
39
+ # pdfs = await page.eval_on_selector_all(
40
+ # "a[href$='.pdf']", "els => els.map(e => e.href)"
41
+ # )
42
 
43
+ # extracted_data.update(
44
+ # {
45
+ # "title": title,
46
+ # "content": content,
47
+ # "img_url": images,
48
+ # "video_url": videos,
49
+ # "audio_url": audios,
50
+ # "pdf_url": pdfs,
51
+ # }
52
+ # )
53
 
54
+ # await crawler.run([target_url])
55
+ # return extracted_data
56
 
57
+ # def extract_pdf_text(self, pdf_url):
58
+ # http = HttpxHttpClient()
59
+ # response = http.get(pdf_url)
60
+ # with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
61
+ # tmp.write(response.content)
62
+ # tmp_path = tmp.name
63
+ # text = ""
64
+ # with fitz.open(tmp_path) as doc:
65
+ # for page in doc:
66
+ # text += page.get_text("text")
67
+ # os.remove(tmp_path)
68
+ # return text.strip()
69
 
70
+ # def _run(self, url: str) -> dict:
71
+ # try:
72
+ # parsed = urlparse(url)
73
+ # if parsed.path.lower().endswith(".pdf"):
74
+ # content = self.extract_pdf_text(url)
75
+ # return {
76
+ # "page_url": url,
77
+ # "title": os.path.basename(parsed.path),
78
+ # "content": content,
79
+ # "img_url": [],
80
+ # "video_url": [],
81
+ # "audio_url": [],
82
+ # "pdf_url": [url],
83
+ # }
84
+ # scraped_data = asyncio.run(self.scrape_with_crawlee(url))
85
+ # return {"page_url": url, **scraped_data}
86
+ # except Exception as e:
87
+ # return {"error": str(e), "page_url": url}
88
 
89
+ # async def _arun(self, url: str) -> dict:
90
+ # return self._run(url)