ziadsameh32 commited on
Commit
200825a
·
1 Parent(s): 05635a4
agents/design_phase/scraper/extractor_bs4.py CHANGED
@@ -12,8 +12,8 @@ scraping_bs4_agent = Agent(
12
  goal="\n".join(
13
  [
14
  "Collect and extract complete, structured, and educationally valuable content "
15
- "from Arabic and English websites and PDFs related to the course topic: {topic}.",
16
- "Focus on sources that match the course domain ({domain}), content type ({content_type}), "
17
  "and audience ({audience}).",
18
  "Prioritize materials that can serve as strong foundations for creating {material_type} "
19
  "learning materials (conceptual, structural, procedural, and real-world).",
@@ -59,6 +59,7 @@ scraping_bs4_task = Task(
59
  " - Provide short expert notes justifying the ranking and explaining how the content can contribute to the course design.",
60
  "",
61
  "Ensure no important content, examples, or explanations are omitted from extraction.",
 
62
  ]
63
  ),
64
  expected_output=(
@@ -88,6 +89,7 @@ scraping_bs4_task = Task(
88
  "Make the output compatible with Python's ast library (use r1 = result.dict()['raw']; f_result = ast.literal_eval(r1)).\n"
89
  "Ensure valid JSON syntax with no unterminated strings or extra text.\n"
90
  "Output ONLY the dictionary — no thoughts, explanations, or markdown formatting."
 
91
  ),
92
  agent=scraping_bs4_agent,
93
  output_json=UnitSubtopicOutputModel,
 
12
  goal="\n".join(
13
  [
14
  "Collect and extract complete, structured, and educationally valuable content "
15
+ "from Arabic and English websites and PDFs that I will give to you, related to the course topic: {topic}.",
16
+ "Focus on content that matches the course domain ({domain}), content type ({content_type}), "
17
  "and audience ({audience}).",
18
  "Prioritize materials that can serve as strong foundations for creating {material_type} "
19
  "learning materials (conceptual, structural, procedural, and real-world).",
 
59
  " - Provide short expert notes justifying the ranking and explaining how the content can contribute to the course design.",
60
  "",
61
  "Ensure no important content, examples, or explanations are omitted from extraction.",
62
+ "Output will be in JSON format with no task output or raw data, only the formatted JSON dictionary.",
63
  ]
64
  ),
65
  expected_output=(
 
89
  "Make the output compatible with Python's ast library (use r1 = result.dict()['raw']; f_result = ast.literal_eval(r1)).\n"
90
  "Ensure valid JSON syntax with no unterminated strings or extra text.\n"
91
  "Output ONLY the dictionary — no thoughts, explanations, or markdown formatting."
92
+ "D"
93
  ),
94
  agent=scraping_bs4_agent,
95
  output_json=UnitSubtopicOutputModel,
agents/design_phase/scraper/extractor_bulit_in.py CHANGED
@@ -9,8 +9,8 @@ scraping_built_in_agent = Agent(
9
  goal="\n".join(
10
  [
11
  "Collect and extract complete, structured, and educationally valuable content "
12
- "from Arabic and English websites and PDFs related to the course topic: {topic}.",
13
- "Focus on sources that match the course domain ({domain}), content type ({content_type}), "
14
  "and audience ({audience}).",
15
  "Prioritize materials that can serve as strong foundations for creating {material_type} "
16
  "learning materials (conceptual, structural, procedural, and real-world).",
@@ -56,6 +56,7 @@ scraping_built_in_task = Task(
56
  " - Provide short expert notes justifying the ranking and explaining how the content can contribute to the course design.",
57
  "",
58
  "Ensure no important content, examples, or explanations are omitted from extraction.",
 
59
  ]
60
  ),
61
  expected_output=(
 
9
  goal="\n".join(
10
  [
11
  "Collect and extract complete, structured, and educationally valuable content "
12
+ "from Arabic and English websites and PDFs that I will give to you, related to the course topic: {topic}.",
13
+ "Focus on content that matches the course domain ({domain}), content type ({content_type}), "
14
  "and audience ({audience}).",
15
  "Prioritize materials that can serve as strong foundations for creating {material_type} "
16
  "learning materials (conceptual, structural, procedural, and real-world).",
 
56
  " - Provide short expert notes justifying the ranking and explaining how the content can contribute to the course design.",
57
  "",
58
  "Ensure no important content, examples, or explanations are omitted from extraction.",
59
+ "Output will be in JSON format with no task output or raw data, only the formatted JSON dictionary.",
60
  ]
61
  ),
62
  expected_output=(
agents/design_phase/scraper/extractor_crawlee.py CHANGED
@@ -12,9 +12,9 @@
12
  # goal="\n".join(
13
  # [
14
  # "Collect and extract complete, structured, and educationally valuable content "
15
- # "from Arabic and English websites and PDFs related to the course topic: {topic}.",
16
- # "Focus on sources that match the course domain ({domain}), content type ({content_type}), "
17
- # "and audience ({audience}).",
18
  # "Prioritize materials that can serve as strong foundations for creating {material_type} "
19
  # "learning materials (conceptual, structural, procedural, and real-world).",
20
  # "Extract full text including all sections, examples, and details, ensuring high accuracy for Arabic text.",
@@ -59,6 +59,8 @@
59
  # " - Provide short expert notes justifying the ranking and explaining how the content can contribute to the course design.",
60
  # "",
61
  # "Ensure no important content, examples, or explanations are omitted from extraction.",
 
 
62
  # ]
63
  # ),
64
  # expected_output=(
 
12
  # goal="\n".join(
13
  # [
14
  # "Collect and extract complete, structured, and educationally valuable content "
15
+ # "from Arabic and English websites and PDFs that I will give to you, related to the course topic: {topic}.",
16
+ # "Focus on content that matches the course domain ({domain}), content type ({content_type}), "
17
+ # "and audience ({audience}).",
18
  # "Prioritize materials that can serve as strong foundations for creating {material_type} "
19
  # "learning materials (conceptual, structural, procedural, and real-world).",
20
  # "Extract full text including all sections, examples, and details, ensuring high accuracy for Arabic text.",
 
59
  # " - Provide short expert notes justifying the ranking and explaining how the content can contribute to the course design.",
60
  # "",
61
  # "Ensure no important content, examples, or explanations are omitted from extraction.",
62
+ # "Output will be in JSON format with no task output or raw data, only the formatted JSON dictionary.",
63
+
64
  # ]
65
  # ),
66
  # expected_output=(
agents/design_phase/source_finder.py CHANGED
@@ -19,7 +19,7 @@ search_engine_agent = Agent(
19
  "Include Arabic sources when contextually valuable, but prioritize English academic sources for broader coverage.",
20
  "Ensure all results are concise, relevant, and aligned with the outline headings to directly support course design and educational content.",
21
  "For each query, return at most {no_links} results with structured metadata (title, url, content summary, score, search_query).",
22
- "Validate URLs before using them",
23
  ]
24
  ),
25
  backstory=(
@@ -54,8 +54,9 @@ search_engine_task = Task(
54
  "The final results will serve as the knowledge base for building trusted, high-quality content, ",
55
  "with a focus on supporting the course’s educational objectives and cultural enrichment goals.",
56
  "with a focus on supporting the course’s educational objectives and cultural enrichment goals.",
57
- "Validate URLs before using them",
58
  "Get sites that newer than 2021.",
 
 
59
  ]
60
  ),
61
  expected_output=(
 
19
  "Include Arabic sources when contextually valuable, but prioritize English academic sources for broader coverage.",
20
  "Ensure all results are concise, relevant, and aligned with the outline headings to directly support course design and educational content.",
21
  "For each query, return at most {no_links} results with structured metadata (title, url, content summary, score, search_query).",
22
+ "Validate URLs before retrieving them, and if a URL is not valid, do not retrieve it",
23
  ]
24
  ),
25
  backstory=(
 
54
  "The final results will serve as the knowledge base for building trusted, high-quality content, ",
55
  "with a focus on supporting the course’s educational objectives and cultural enrichment goals.",
56
  "with a focus on supporting the course’s educational objectives and cultural enrichment goals.",
 
57
  "Get sites that newer than 2021.",
58
+ "Validate URLs before retrieving them, and if a URL is not valid, do not retrieve it",
59
+
60
  ]
61
  ),
62
  expected_output=(
my_file.json ADDED
The diff for this file is too large to render. See raw diff
 
my_file_bs4.json ADDED
The diff for this file is too large to render. See raw diff
 
my_file_bs4_agent.json ADDED
The diff for this file is too large to render. See raw diff
 
routers/scraper_route.py CHANGED
@@ -341,9 +341,7 @@ async def process_json_scrape(request: Request, file: UploadFile, data: str, mod
341
  "unit_title": unit_title,
342
  "subtopic_title": subtopic_title,
343
  "query": query,
344
- "url": url,
345
- "metadata": metadata,
346
- "scraped": scraped,
347
  }
348
  )
349
 
 
341
  "unit_title": unit_title,
342
  "subtopic_title": subtopic_title,
343
  "query": query,
344
+ "parts": scraped,
 
 
345
  }
346
  )
347
 
routers/source_route.py CHANGED
@@ -64,24 +64,23 @@ async def run_training(request: Request, file: UploadFile, data: str = Form(...)
64
  subtopic_title = unit["subtopic_title"]
65
  queries = unit["queries"]
66
 
67
- for query in queries:
68
- print(f"🔍 Running search for [{subtopic_title}] | Query: {query}")
69
-
70
- merged_input = {
71
- **user_inputs,
72
- "score_th": 0.6,
73
- "no_links": 3,
74
- "queries": query,
75
- "unit_title": unit_title,
76
- "subtopic_title": subtopic_title,
77
- "TRUSTED_SITES": TRUSTED_SITES,
78
- }
79
-
80
- try:
81
- result = crew.kickoff(inputs=merged_input)
82
- all_results.append(result.json_dict)
83
- except Exception as e:
84
- print(f"⚠️ Error while running query '{query}': {e}")
85
  output_data = {"results": all_results}
86
  # ✅ Save results to file
87
  output_file = f"/tmp/search_results"
 
64
  subtopic_title = unit["subtopic_title"]
65
  queries = unit["queries"]
66
 
67
+ print(f"🔍 Running search for [{subtopic_title}] | Query: {queries}")
68
+
69
+ merged_input = {
70
+ **user_inputs,
71
+ "score_th": 0.6,
72
+ "no_links": 3,
73
+ "queries": queries,
74
+ "unit_title": unit_title,
75
+ "subtopic_title": subtopic_title,
76
+ "TRUSTED_SITES": TRUSTED_SITES,
77
+ }
78
+
79
+ try:
80
+ result = crew.kickoff(inputs=merged_input)
81
+ all_results.append(result.json_dict)
82
+ except Exception as e:
83
+ print(f"⚠️ Error while running query '{queries}': {e}")
 
84
  output_data = {"results": all_results}
85
  # ✅ Save results to file
86
  output_file = f"/tmp/search_results"
schemas/inputs_schema.py CHANGED
@@ -27,3 +27,13 @@ class OutlineInput(BaseModel):
27
  # "audience": "القادة والموظفون والإداريون",
28
  # "material_type": ["مفاهمية", "هيكلية", "شخصية", "واقعية", "اجرائية"],
29
  # }
 
 
 
 
 
 
 
 
 
 
 
27
  # "audience": "القادة والموظفون والإداريون",
28
  # "material_type": ["مفاهمية", "هيكلية", "شخصية", "واقعية", "اجرائية"],
29
  # }
30
+
31
+
32
+ # {
33
+ # "topic": "ريادة الأعمال",
34
+ # "domain": "Management, Business",
35
+ # "content_type": "Awareness, Training",
36
+ # "audience": "CEOs, Directors, Entrepreneurs",
37
+ # "material_type": ["مفاهمية", "هيكلية", "اجرائية", "واقعية"],
38
+ # "units_number": 2,
39
+ # }
tools/scraper/no_agent/bs4_scraper.py CHANGED
@@ -13,9 +13,29 @@ def scrape_with_bs4(url: str) -> dict:
13
  response = requests.get(url, timeout=10)
14
  soup = BeautifulSoup(response.text, "html.parser")
15
 
16
- text = soup.get_text(separator="\n").strip()
 
17
 
18
- return {"url": url, "content": text}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  except Exception as e:
21
  return {"url": url, "error": str(e)}
 
13
  response = requests.get(url, timeout=10)
14
  soup = BeautifulSoup(response.text, "html.parser")
15
 
16
+ # text = soup.get_text(separator="\n").strip()
17
+ # imgs= soup.find_all
18
 
19
+ title = soup.title.string.strip() if soup.title else "No title"
20
+ text = " ".join([p.get_text(strip=True) for p in soup.find_all("p")])
21
+ img_urls = [img["src"] for img in soup.find_all("img", src=True)]
22
+ video_urls = [vid["src"] for vid in soup.find_all("video", src=True)]
23
+ audio_urls = [aud["src"] for aud in soup.find_all("audio", src=True)]
24
+ pdf_urls = [
25
+ a["href"]
26
+ for a in soup.find_all("a", href=True)
27
+ if a["href"].endswith(".pdf")
28
+ ]
29
+
30
+ return {
31
+ "page_url": url,
32
+ "title": title,
33
+ "content": text,
34
+ "img_url": img_urls,
35
+ "video_url": video_urls,
36
+ "audio_url": audio_urls,
37
+ "pdf_url": pdf_urls,
38
+ }
39
 
40
  except Exception as e:
41
  return {"url": url, "error": str(e)}
tools/validate_url.py CHANGED
@@ -10,7 +10,7 @@ class URLValidatorTool(BaseTool):
10
  def _run(self, url: str) -> dict:
11
  """Synchronous tool execution"""
12
  try:
13
- response = requests.head(url, allow_redirects=True, timeout=5)
14
  return {
15
  "url": url,
16
  "status_code": response.status_code,
 
10
  def _run(self, url: str) -> dict:
11
  """Synchronous tool execution"""
12
  try:
13
+ response = requests.head(url, allow_redirects=True, timeout=15)
14
  return {
15
  "url": url,
16
  "status_code": response.status_code,