Commit 200825a · Parent: 05635a4 · scraper

Files changed:
- agents/design_phase/scraper/extractor_bs4.py  +4 -2
- agents/design_phase/scraper/extractor_bulit_in.py  +3 -2
- agents/design_phase/scraper/extractor_crawlee.py  +5 -3
- agents/design_phase/source_finder.py  +3 -2
- my_file.json  (added)
- my_file_bs4.json  (added)
- my_file_bs4_agent.json  (added)
- routers/scraper_route.py  +1 -3
- routers/source_route.py  +17 -18
- schemas/inputs_schema.py  +10 -0
- tools/scraper/no_agent/bs4_scraper.py  +22 -2
- tools/validate_url.py  +1 -1
agents/design_phase/scraper/extractor_bs4.py
CHANGED

@@ -12,8 +12,8 @@ scraping_bs4_agent = Agent(
     goal="\n".join(
         [
             "Collect and extract complete, structured, and educationally valuable content "
-            "from Arabic and English websites and PDFs related to the course topic: {topic}.",
-            "Focus on …
+            "from Arabic and English websites and PDFs that i will give to you related to the course topic: {topic}.",
+            "Focus on content that match the course domain ({domain}), content type ({content_type}), "
             "and audience ({audience}).",
             "Prioritize materials that can serve as strong foundations for creating {material_type} "
             "learning materials (conceptual, structural, procedural, and real-world).",

@@ -59,6 +59,7 @@ scraping_bs4_task = Task(
             " - Provide short expert notes justifying the ranking and explaining how the content can contribute to the course design.",
             "",
             "Ensure no important content, examples, or explanations are omitted from extraction.",
+            "Output will be a json format with no task output or raw data only the formatted json dictionary.",
         ]
     ),
     expected_output=(

@@ -88,6 +89,7 @@ scraping_bs4_task = Task(
         "Make the output compatible with Python's ast library (use r1 = result.dict()['raw']; f_result = ast.literal_eval(r1)).\n"
         "Ensure valid JSON syntax with no unterminated strings or extra text.\n"
         "Output ONLY the dictionary — no thoughts, explanations, or markdown formatting."
+        "D"
     ),
     agent=scraping_bs4_agent,
     output_json=UnitSubtopicOutputModel,
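The expected_output above tells downstream code to recover the dictionary with ast.literal_eval. A minimal sketch of that parsing step, using a hypothetical raw payload rather than a real crew run:

import ast

# Hypothetical raw string, standing in for result.dict()["raw"] after a crew run.
raw = "{'unit_title': 'Entrepreneurship', 'subtopics': ['Ideation', 'Funding']}"

# ast.literal_eval only accepts Python-literal syntax, which is why the prompt
# insists on "ONLY the dictionary": any surrounding prose or markdown fence
# would raise ValueError here.
f_result = ast.literal_eval(raw)
print(f_result["subtopics"])  # ['Ideation', 'Funding']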
agents/design_phase/scraper/extractor_bulit_in.py
CHANGED

@@ -9,8 +9,8 @@ scraping_built_in_agent = Agent(
     goal="\n".join(
         [
             "Collect and extract complete, structured, and educationally valuable content "
-            "from Arabic and English websites and PDFs related to the course topic: {topic}.",
-            "Focus on …
+            "from Arabic and English websites and PDFs that i will give to you related to the course topic: {topic}.",
+            "Focus on content that match the course domain ({domain}), content type ({content_type}), "
             "and audience ({audience}).",
             "Prioritize materials that can serve as strong foundations for creating {material_type} "
             "learning materials (conceptual, structural, procedural, and real-world).",

@@ -56,6 +56,7 @@ scraping_built_in_task = Task(
             " - Provide short expert notes justifying the ranking and explaining how the content can contribute to the course design.",
             "",
             "Ensure no important content, examples, or explanations are omitted from extraction.",
+            "Output will be a json format with no task output or raw data only the formatted json dictionary.",
         ]
     ),
     expected_output=(
agents/design_phase/scraper/extractor_crawlee.py
CHANGED

@@ -12,9 +12,9 @@
 # goal="\n".join(
 #     [
 #         "Collect and extract complete, structured, and educationally valuable content "
-#         "from Arabic and English websites and PDFs related to the course topic: {topic}.",
-#         …
-#         …
+#         "from Arabic and English websites and PDFs that i will give to you related to the course topic: {topic}.",
+#         "Focus on content that match the course domain ({domain}), content type ({content_type}), "
+#         "and audience ({audience}).",
 #         "Prioritize materials that can serve as strong foundations for creating {material_type} "
 #         "learning materials (conceptual, structural, procedural, and real-world).",
 #         "Extract full text including all sections, examples, and details, ensuring high accuracy for Arabic text.",

@@ -59,6 +59,8 @@
 #         " - Provide short expert notes justifying the ranking and explaining how the content can contribute to the course design.",
 #         "",
 #         "Ensure no important content, examples, or explanations are omitted from extraction.",
+#         "Output will be a json format with no task output or raw data only the formatted json dictionary.",
+
 #     ]
 # ),
 # expected_output=(
agents/design_phase/source_finder.py
CHANGED

@@ -19,7 +19,7 @@ search_engine_agent = Agent(
         "Include Arabic sources when contextually valuable, but prioritize English academic sources for broader coverage.",
         "Ensure all results are concise, relevant, and aligned with the outline headings to directly support course design and educational content.",
         "For each query, return at most {no_links} results with structured metadata (title, url, content summary, score, search_query).",
-        "Validate URLs before …
+        "Validate URLs before retrieving them and if it is not valid dont retieve",
     ]
 ),
 backstory=(

@@ -54,8 +54,9 @@ search_engine_task = Task(
     "The final results will serve as the knowledge base for building trusted, high-quality content, ",
     "with a focus on supporting the course’s educational objectives and cultural enrichment goals.",
     "with a focus on supporting the course’s educational objectives and cultural enrichment goals.",
-    "Validate URLs before using them",
     "Get sites that newer than 2021.",
+    "Validate URLs before retrieving them and if it is not valid dont retieve",
+
 ]
 ),
 expected_output=(
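The agent is now told to validate URLs before retrieving them. A minimal sketch of what that check can look like outside the agent, assuming the instruction maps onto the same HEAD request used in tools/validate_url.py below; is_reachable and the sample result list are illustrative, not repo code:

import requests

def is_reachable(url: str, timeout: int = 15) -> bool:
    # Best-effort pre-retrieval check: HEAD follows redirects and treats
    # any non-error status (< 400) as valid enough to scrape later.
    try:
        resp = requests.head(url, allow_redirects=True, timeout=timeout)
        return resp.status_code < 400
    except requests.RequestException:
        return False

search_results = [
    {"title": "Example source", "url": "https://example.com", "score": 0.8},
]
# Drop unreachable links before they reach the scraper stage.
reachable = [r for r in search_results if is_reachable(r["url"])]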
my_file.json
ADDED (diff too large to render; see raw diff)

my_file_bs4.json
ADDED (diff too large to render; see raw diff)

my_file_bs4_agent.json
ADDED (diff too large to render; see raw diff)
routers/scraper_route.py
CHANGED

@@ -341,9 +341,7 @@ async def process_json_scrape(request: Request, file: UploadFile, data: str, mod…
             "unit_title": unit_title,
             "subtopic_title": subtopic_title,
             "query": query,
-            "…
-            "metadata": metadata,
-            "scraped": scraped,
+            "parts": scraped,
         }
     )
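The appended record now carries a single parts field in place of the separate metadata and scraped fields. A hypothetical example of one record after this change; the field values are illustrative, not taken from the repo:

record = {
    "unit_title": "Entrepreneurship",
    "subtopic_title": "Funding",
    "query": "startup funding basics",
    # "parts" now holds whatever the scraper returned for this query,
    # replacing the former separate "metadata" and "scraped" fields.
    "parts": {
        "page_url": "https://example.com",
        "title": "Example source",
        "content": "…",
    },
}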
routers/source_route.py
CHANGED

@@ -64,24 +64,23 @@ async def run_training(request: Request, file: UploadFile, data: str = Form(...)
         subtopic_title = unit["subtopic_title"]
         queries = unit["queries"]

-        for …
-            …
-            print(f"⚠️ Error while running query '{query}': {e}")
+        print(f"🔍 Running search for [{subtopic_title}] | Query: {queries}")
+
+        merged_input = {
+            **user_inputs,
+            "score_th": 0.6,
+            "no_links": 3,
+            "queries": queries,
+            "unit_title": unit_title,
+            "subtopic_title": subtopic_title,
+            "TRUSTED_SITES": TRUSTED_SITES,
+        }
+
+        try:
+            result = crew.kickoff(inputs=merged_input)
+            all_results.append(result.json_dict)
+        except Exception as e:
+            print(f"⚠️ Error while running query '{queries}': {e}")
     output_data = {"results": all_results}
     # ✅ Save results to file
     output_file = f"/tmp/search_results"
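One sharp edge in the new block: result.json_dict can be None when the model output fails structured-output validation, so appending it unchecked leaves None entries in all_results. A sketch of a defensive variant as a standalone helper; the None behaviour of json_dict and the helper itself are assumptions, not repo code:

from typing import Any

def run_unit_queries(crew: Any, merged_input: dict, all_results: list) -> None:
    """Kick off the crew for one subtopic and keep only structured output."""
    queries = merged_input.get("queries")
    try:
        result = crew.kickoff(inputs=merged_input)
        payload = result.json_dict  # may be None if JSON validation failed (assumption)
        if payload is not None:
            all_results.append(payload)
        else:
            print(f"⚠️ No structured output for '{queries}'")
    except Exception as e:
        print(f"⚠️ Error while running query '{queries}': {e}")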
schemas/inputs_schema.py
CHANGED

@@ -27,3 +27,13 @@ class OutlineInput(BaseModel):
 # "audience": "القادة والموظفون والإداريون",
 # "material_type": ["مفاهمية", "هيكلية", "شخصية", "واقعية", "اجرائية"],
 # }
+
+
+# {
+#     "topic": "ريادة الأعمال",
+#     "domain": "Management, Business",
+#     "content_type": "Awareness, Training",
+#     "audience": "CEOs, Directors, Entrepreneurs",
+#     "material_type": ["مفاهمية", "هيكلية", "اجرائية", "واقعية"],
+#     "units_number": 2,
+# }
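The commented block adds a second sample payload for OutlineInput (the topic "ريادة الأعمال" is Arabic for entrepreneurship; the material_type values are Arabic for conceptual, structural, procedural, and real-world). A sketch of a model that would accept it; the field names are inferred from the example, and the real OutlineInput in this file may declare them differently:

from typing import List
from pydantic import BaseModel

class OutlineInputSketch(BaseModel):
    # Field names inferred from the commented sample payloads (assumption).
    topic: str
    domain: str
    content_type: str
    audience: str
    material_type: List[str]
    units_number: int

sample = OutlineInputSketch(
    topic="ريادة الأعمال",  # Entrepreneurship
    domain="Management, Business",
    content_type="Awareness, Training",
    audience="CEOs, Directors, Entrepreneurs",
    material_type=["مفاهمية", "هيكلية", "اجرائية", "واقعية"],
    units_number=2,
)
print(sample)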
tools/scraper/no_agent/bs4_scraper.py
CHANGED

@@ -13,9 +13,29 @@ def scrape_with_bs4(url: str) -> dict:
         response = requests.get(url, timeout=10)
         soup = BeautifulSoup(response.text, "html.parser")

-        text = soup.get_text(separator="\n").strip()
-        …
+        # text = soup.get_text(separator="\n").strip()
+        # imgs= soup.find_all

+        title = soup.title.string.strip() if soup.title else "No title"
+        text = " ".join([p.get_text(strip=True) for p in soup.find_all("p")])
+        img_urls = [img["src"] for img in soup.find_all("img", src=True)]
+        video_urls = [vid["src"] for vid in soup.find_all("video", src=True)]
+        audio_urls = [aud["src"] for aud in soup.find_all("audio", src=True)]
+        pdf_urls = [
+            a["href"]
+            for a in soup.find_all("a", href=True)
+            if a["href"].endswith(".pdf")
+        ]
+
+        return {
+            "page_url": url,
+            "title": title,
+            "content": text,
+            "img_url": img_urls,
+            "video_url": video_urls,
+            "audio_url": audio_urls,
+            "pdf_url": pdf_urls,
+        }

     except Exception as e:
         return {"url": url, "error": str(e)}
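The new extraction returns src/href attributes exactly as they appear in the HTML, and those are often relative paths. A small sketch of normalizing them against the page URL with urllib.parse.urljoin, as a possible follow-up; the sample HTML is illustrative:

from urllib.parse import urljoin

from bs4 import BeautifulSoup

html = '<html><body><img src="/media/a.png"><a href="docs/b.pdf">b</a></body></html>'
soup = BeautifulSoup(html, "html.parser")
page_url = "https://example.com/course/"

# src/href attributes are resolved against the page URL, so both absolute
# paths and page-relative paths become full URLs.
img_urls = [urljoin(page_url, img["src"]) for img in soup.find_all("img", src=True)]
pdf_urls = [
    urljoin(page_url, a["href"])
    for a in soup.find_all("a", href=True)
    if a["href"].endswith(".pdf")
]
print(img_urls)  # ['https://example.com/media/a.png']
print(pdf_urls)  # ['https://example.com/course/docs/b.pdf']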
tools/validate_url.py
CHANGED

@@ -10,7 +10,7 @@ class URLValidatorTool(BaseTool):
     def _run(self, url: str) -> dict:
         """Synchronous tool execution"""
         try:
-            response = requests.head(url, allow_redirects=True, timeout=…
+            response = requests.head(url, allow_redirects=True, timeout=15)
             return {
                 "url": url,
                 "status_code": response.status_code,