Commit 200825a · Parent: 05635a4 · scraper

Files changed:
- agents/design_phase/scraper/extractor_bs4.py  +4 -2
- agents/design_phase/scraper/extractor_bulit_in.py  +3 -2
- agents/design_phase/scraper/extractor_crawlee.py  +5 -3
- agents/design_phase/source_finder.py  +3 -2
- my_file.json  (added)
- my_file_bs4.json  (added)
- my_file_bs4_agent.json  (added)
- routers/scraper_route.py  +1 -3
- routers/source_route.py  +17 -18
- schemas/inputs_schema.py  +10 -0
- tools/scraper/no_agent/bs4_scraper.py  +22 -2
- tools/validate_url.py  +1 -1
agents/design_phase/scraper/extractor_bs4.py
CHANGED

@@ -12,8 +12,8 @@ scraping_bs4_agent = Agent(
     goal="\n".join(
         [
             "Collect and extract complete, structured, and educationally valuable content "
-            "from Arabic and English websites and PDFs related to the course topic: {topic}.",
-            "Focus on …
+            "from Arabic and English websites and PDFs that i will give to you related to the course topic: {topic}.",
+            "Focus on content that match the course domain ({domain}), content type ({content_type}), "
             "and audience ({audience}).",
             "Prioritize materials that can serve as strong foundations for creating {material_type} "
             "learning materials (conceptual, structural, procedural, and real-world).",

@@ -59,6 +59,7 @@ scraping_bs4_task = Task(
             " - Provide short expert notes justifying the ranking and explaining how the content can contribute to the course design.",
             "",
             "Ensure no important content, examples, or explanations are omitted from extraction.",
+            "Output will be a json format with no task output or raw data only the formatted json dictionary.",
         ]
     ),
     expected_output=(

@@ -88,6 +89,7 @@ scraping_bs4_task = Task(
         "Make the output compatible with Python's ast library (use r1 = result.dict()['raw']; f_result = ast.literal_eval(r1)).\n"
         "Ensure valid JSON syntax with no unterminated strings or extra text.\n"
         "Output ONLY the dictionary — no thoughts, explanations, or markdown formatting."
+        "D"
     ),
     agent=scraping_bs4_agent,
     output_json=UnitSubtopicOutputModel,
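The expected_output above tells downstream code to recover the dictionary with ast.literal_eval. A minimal sketch of that parsing step, using a hypothetical raw payload rather than a real crew run:

import ast

# Hypothetical raw string, standing in for result.dict()["raw"] after a crew run.
raw = "{'unit_title': 'Entrepreneurship', 'subtopics': ['Ideation', 'Funding']}"

# ast.literal_eval only accepts Python-literal syntax, which is why the prompt
# insists on "ONLY the dictionary": any surrounding prose or markdown fence
# would raise ValueError here.
f_result = ast.literal_eval(raw)
print(f_result["subtopics"])  # ['Ideation', 'Funding']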
agents/design_phase/scraper/extractor_bulit_in.py
CHANGED

@@ -9,8 +9,8 @@ scraping_built_in_agent = Agent(
     goal="\n".join(
         [
             "Collect and extract complete, structured, and educationally valuable content "
-            "from Arabic and English websites and PDFs related to the course topic: {topic}.",
-            "Focus on …
+            "from Arabic and English websites and PDFs that i will give to you related to the course topic: {topic}.",
+            "Focus on content that match the course domain ({domain}), content type ({content_type}), "
             "and audience ({audience}).",
             "Prioritize materials that can serve as strong foundations for creating {material_type} "
             "learning materials (conceptual, structural, procedural, and real-world).",

@@ -56,6 +56,7 @@ scraping_built_in_task = Task(
             " - Provide short expert notes justifying the ranking and explaining how the content can contribute to the course design.",
             "",
             "Ensure no important content, examples, or explanations are omitted from extraction.",
+            "Output will be a json format with no task output or raw data only the formatted json dictionary.",
         ]
     ),
     expected_output=(
agents/design_phase/scraper/extractor_crawlee.py
CHANGED

@@ -12,9 +12,9 @@
 # goal="\n".join(
 #     [
 #         "Collect and extract complete, structured, and educationally valuable content "
-#         "from Arabic and English websites and PDFs related to the course topic: {topic}.",
-#         …
-#         …
+#         "from Arabic and English websites and PDFs that i will give to you related to the course topic: {topic}.",
+#         "Focus on content that match the course domain ({domain}), content type ({content_type}), "
+#         "and audience ({audience}).",
 #         "Prioritize materials that can serve as strong foundations for creating {material_type} "
 #         "learning materials (conceptual, structural, procedural, and real-world).",
 #         "Extract full text including all sections, examples, and details, ensuring high accuracy for Arabic text.",

@@ -59,6 +59,8 @@
 #         " - Provide short expert notes justifying the ranking and explaining how the content can contribute to the course design.",
 #         "",
 #         "Ensure no important content, examples, or explanations are omitted from extraction.",
+#         "Output will be a json format with no task output or raw data only the formatted json dictionary.",
+
 #     ]
 # ),
 # expected_output=(
agents/design_phase/source_finder.py
CHANGED

@@ -19,7 +19,7 @@ search_engine_agent = Agent(
         "Include Arabic sources when contextually valuable, but prioritize English academic sources for broader coverage.",
         "Ensure all results are concise, relevant, and aligned with the outline headings to directly support course design and educational content.",
         "For each query, return at most {no_links} results with structured metadata (title, url, content summary, score, search_query).",
-        "Validate URLs before …
+        "Validate URLs before retrieving them and if it is not valid dont retieve",
     ]
 ),
 backstory=(

@@ -54,8 +54,9 @@ search_engine_task = Task(
     "The final results will serve as the knowledge base for building trusted, high-quality content, ",
     "with a focus on supporting the course’s educational objectives and cultural enrichment goals.",
     "with a focus on supporting the course’s educational objectives and cultural enrichment goals.",
-    "Validate URLs before using them",
     "Get sites that newer than 2021.",
+    "Validate URLs before retrieving them and if it is not valid dont retieve",
+
 ]
 ),
 expected_output=(
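The agent is now told to validate URLs before retrieving them. A minimal sketch of what that check can look like outside the agent, assuming the instruction maps onto the same HEAD request used in tools/validate_url.py below; is_reachable and the sample result list are illustrative, not repo code:

import requests

def is_reachable(url: str, timeout: int = 15) -> bool:
    # Best-effort pre-retrieval check: HEAD follows redirects and treats
    # any non-error status (< 400) as valid enough to scrape later.
    try:
        resp = requests.head(url, allow_redirects=True, timeout=timeout)
        return resp.status_code < 400
    except requests.RequestException:
        return False

search_results = [
    {"title": "Example source", "url": "https://example.com", "score": 0.8},
]
# Drop unreachable links before they reach the scraper stage.
reachable = [r for r in search_results if is_reachable(r["url"])]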
my_file.json
ADDED (diff too large to render; see raw diff)

my_file_bs4.json
ADDED (diff too large to render; see raw diff)

my_file_bs4_agent.json
ADDED (diff too large to render; see raw diff)
routers/scraper_route.py
CHANGED

@@ -341,9 +341,7 @@ async def process_json_scrape(request: Request, file: UploadFile, data: str, mod…
             "unit_title": unit_title,
             "subtopic_title": subtopic_title,
             "query": query,
-            "…
-            "metadata": metadata,
-            "scraped": scraped,
+            "parts": scraped,
         }
     )
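The appended record now carries a single parts field in place of the separate metadata and scraped fields. A hypothetical example of one record after this change; the field values are illustrative, not taken from the repo:

record = {
    "unit_title": "Entrepreneurship",
    "subtopic_title": "Funding",
    "query": "startup funding basics",
    # "parts" now holds whatever the scraper returned for this query,
    # replacing the former separate "metadata" and "scraped" fields.
    "parts": {
        "page_url": "https://example.com",
        "title": "Example source",
        "content": "…",
    },
}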
routers/source_route.py
CHANGED

@@ -64,24 +64,23 @@ async def run_training(request: Request, file: UploadFile, data: str = Form(...)
         subtopic_title = unit["subtopic_title"]
         queries = unit["queries"]

-        for …
-            …
-            print(f"⚠️ Error while running query '{query}': {e}")
+        print(f"🔍 Running search for [{subtopic_title}] | Query: {queries}")
+
+        merged_input = {
+            **user_inputs,
+            "score_th": 0.6,
+            "no_links": 3,
+            "queries": queries,
+            "unit_title": unit_title,
+            "subtopic_title": subtopic_title,
+            "TRUSTED_SITES": TRUSTED_SITES,
+        }
+
+        try:
+            result = crew.kickoff(inputs=merged_input)
+            all_results.append(result.json_dict)
+        except Exception as e:
+            print(f"⚠️ Error while running query '{queries}': {e}")
     output_data = {"results": all_results}
     # ✅ Save results to file
     output_file = f"/tmp/search_results"
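One sharp edge in the new block: result.json_dict can be None when the model output fails structured-output validation, so appending it unchecked leaves None entries in all_results. A sketch of a defensive variant as a standalone helper; the None behaviour of json_dict and the helper itself are assumptions, not repo code:

from typing import Any

def run_unit_queries(crew: Any, merged_input: dict, all_results: list) -> None:
    """Kick off the crew for one subtopic and keep only structured output."""
    queries = merged_input.get("queries")
    try:
        result = crew.kickoff(inputs=merged_input)
        payload = result.json_dict  # may be None if JSON validation failed (assumption)
        if payload is not None:
            all_results.append(payload)
        else:
            print(f"⚠️ No structured output for '{queries}'")
    except Exception as e:
        print(f"⚠️ Error while running query '{queries}': {e}")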
schemas/inputs_schema.py
CHANGED

@@ -27,3 +27,13 @@ class OutlineInput(BaseModel):
 # "audience": "القادة والموظفون والإداريون",
 # "material_type": ["مفاهمية", "هيكلية", "شخصية", "واقعية", "اجرائية"],
 # }
+
+
+# {
+#     "topic": "ريادة الأعمال",
+#     "domain": "Management, Business",
+#     "content_type": "Awareness, Training",
+#     "audience": "CEOs, Directors, Entrepreneurs",
+#     "material_type": ["مفاهمية", "هيكلية", "اجرائية", "واقعية"],
+#     "units_number": 2,
+# }
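The commented block adds a second sample payload for OutlineInput (the topic "ريادة الأعمال" is Arabic for entrepreneurship; the material_type values are Arabic for conceptual, structural, procedural, and real-world). A sketch of a model that would accept it; the field names are inferred from the example, and the real OutlineInput in this file may declare them differently:

from typing import List
from pydantic import BaseModel

class OutlineInputSketch(BaseModel):
    # Field names inferred from the commented sample payloads (assumption).
    topic: str
    domain: str
    content_type: str
    audience: str
    material_type: List[str]
    units_number: int

sample = OutlineInputSketch(
    topic="ريادة الأعمال",  # Entrepreneurship
    domain="Management, Business",
    content_type="Awareness, Training",
    audience="CEOs, Directors, Entrepreneurs",
    material_type=["مفاهمية", "هيكلية", "اجرائية", "واقعية"],
    units_number=2,
)
print(sample)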
tools/scraper/no_agent/bs4_scraper.py
CHANGED

@@ -13,9 +13,29 @@ def scrape_with_bs4(url: str) -> dict:
         response = requests.get(url, timeout=10)
         soup = BeautifulSoup(response.text, "html.parser")

-        text = soup.get_text(separator="\n").strip()
-        …
+        # text = soup.get_text(separator="\n").strip()
+        # imgs= soup.find_all

+        title = soup.title.string.strip() if soup.title else "No title"
+        text = " ".join([p.get_text(strip=True) for p in soup.find_all("p")])
+        img_urls = [img["src"] for img in soup.find_all("img", src=True)]
+        video_urls = [vid["src"] for vid in soup.find_all("video", src=True)]
+        audio_urls = [aud["src"] for aud in soup.find_all("audio", src=True)]
+        pdf_urls = [
+            a["href"]
+            for a in soup.find_all("a", href=True)
+            if a["href"].endswith(".pdf")
+        ]
+
+        return {
+            "page_url": url,
+            "title": title,
+            "content": text,
+            "img_url": img_urls,
+            "video_url": video_urls,
+            "audio_url": audio_urls,
+            "pdf_url": pdf_urls,
+        }

     except Exception as e:
         return {"url": url, "error": str(e)}
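The new extraction returns src/href attributes exactly as they appear in the HTML, and those are often relative paths. A small sketch of normalizing them against the page URL with urllib.parse.urljoin, as a possible follow-up; the sample HTML is illustrative:

from urllib.parse import urljoin

from bs4 import BeautifulSoup

html = '<html><body><img src="/media/a.png"><a href="docs/b.pdf">b</a></body></html>'
soup = BeautifulSoup(html, "html.parser")
page_url = "https://example.com/course/"

# src/href attributes are resolved against the page URL, so both absolute
# paths and page-relative paths become full URLs.
img_urls = [urljoin(page_url, img["src"]) for img in soup.find_all("img", src=True)]
pdf_urls = [
    urljoin(page_url, a["href"])
    for a in soup.find_all("a", href=True)
    if a["href"].endswith(".pdf")
]
print(img_urls)  # ['https://example.com/media/a.png']
print(pdf_urls)  # ['https://example.com/course/docs/b.pdf']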
tools/validate_url.py
CHANGED

@@ -10,7 +10,7 @@ class URLValidatorTool(BaseTool):
     def _run(self, url: str) -> dict:
         """Synchronous tool execution"""
         try:
-            response = requests.head(url, allow_redirects=True, timeout=…
+            response = requests.head(url, allow_redirects=True, timeout=15)
             return {
                 "url": url,
                 "status_code": response.status_code,