Update app.py
app.py CHANGED
@@ -9,6 +9,7 @@ import subprocess
 import io
 import time
 import urllib.parse
+import asyncio
 from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.text_splitter import CharacterTextSplitter
@@ -20,6 +21,9 @@ from langchain_community.document_loaders import PlaywrightURLLoader
 import requests
 # Import Supadata and initialize the client
 from supadata import Supadata, SupadataError
+# Import Crawl4AI
+from crawl4ai import AsyncWebCrawler
+
 SUPADATA_API_KEY = os.getenv("SUPADATA")
 supadata = Supadata(api_key=SUPADATA_API_KEY)
 
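
Note on the new dependency: the AsyncWebCrawler import added here is exercised in the get_data_from_url() hunk further down. As a point of reference, a minimal standalone sketch of the same calls the commit relies on (the URL is a placeholder, and this assumes crawl4ai and its browser dependencies are installed):

    import asyncio
    from crawl4ai import AsyncWebCrawler

    async def demo():
        # Same pattern as the crawl4ai branch added below: open a crawler
        # session, fetch a single URL, and read back the markdown rendering.
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url="https://example.com")
            return result.markdown

    print(asyncio.run(demo()))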
@@ -39,6 +43,7 @@ graph_config = {
     "headless": True
 }
 
+
 def get_data(search_term):
     """
     Run the SearchGraph for a given search term.
@@ -94,56 +99,98 @@ def get_data(search_term):
 
 SUPADATA_API_KEY = os.getenv("SUPADATA")
 
-def get_data_from_url(url):
-
-    loader = PlaywrightURLLoader(urls=[url], remove_selectors=["header", "footer"])
-
-    data = loader.aload()
-
-    test_s = data
+def get_data_from_url(url, scraping_tool="supadata"):
     """
-    Scrape the provided URL using
-
+    Scrape the provided URL using the selected scraping tool.
+
+    Args:
+        url: The URL to scrape
+        scraping_tool: Either "supadata", "crawl4ai", or "playwright"
+
+    Returns:
+        Dictionary containing the extracted grant data
     """
     page_content = None  # Placeholder for storing scraped page content
 
-    #
-
-    web_content = supadata.web.scrape(url)
-    page_content = web_content.content
-    except TypeError as te:
-        if "unexpected keyword argument 'type'" in str(te):
-            st.warning("Falling back to Supadata API due to unexpected keyword 'type' error.")
-        else:
-            st.error(f"Unexpected error in Supadata scrape: {te}, {test_s}")
-
-    # **Step 2: If Supadata's Built-in Scraper Fails, Use Supadata API**
-    if not page_content:
+    # Choose the scraping method based on the selected tool
+    if scraping_tool == "crawl4ai":
         try:
-
-
-
+            # Use Crawl4AI for scraping
+            async def run_crawler():
+                async with AsyncWebCrawler() as crawler:
+                    result = await crawler.arun(url=url)
+                    return result.markdown
 
-
-
-
-
+            # Run the async crawler in a synchronous context
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            page_content = loop.run_until_complete(run_crawler())
+            loop.close()
+
+            st.success("Successfully scraped using Crawl4AI")
         except Exception as e:
-            st.error(f"Error
-
-
-
+            st.error(f"Error using Crawl4AI: {e}")
+            # Fall back to Supadata if Crawl4AI fails
+            st.warning("Falling back to Supadata scraper...")
+            scraping_tool = "supadata"
+
+    if scraping_tool == "playwright":
         try:
-
-
-
-
+            loader = PlaywrightURLLoader(urls=[url], remove_selectors=["header", "footer"])
+            data = loader.aload()
+            page_content = data[0].page_content if data else ""
+            st.success("Successfully scraped using Playwright")
+        except Exception as e:
+            st.error(f"Error using Playwright: {e}")
+            # Fall back to Supadata if Playwright fails
+            st.warning("Falling back to Supadata scraper...")
+            scraping_tool = "supadata"
+
+    if scraping_tool == "supadata":
+        # **Step 1: Attempt Supadata's Built-in Scraper**
+        try:
+            web_content = supadata.web.scrape(url)
+            page_content = web_content.content
+            st.success("Successfully scraped using Supadata built-in scraper")
+        except TypeError as te:
+            if "unexpected keyword argument 'type'" in str(te):
+                st.warning("Falling back to Supadata API due to unexpected keyword 'type' error.")
            else:
-            st.error(f"
+                st.error(f"Unexpected error in Supadata scrape: {te}")
+
+        # **Step 2: If Supadata's Built-in Scraper Fails, Use Supadata API**
+        if not page_content:
+            try:
+                api_url = "https://api.supadata.ai/v1/web/scrape"
+                headers = {"X-API-Key": SUPADATA_API_KEY}
+                response = requests.get(api_url, headers=headers, params={"url": url})
+
+                if response.status_code == 200:
+                    page_content = response.json().get("content", "")
+                    st.success("Successfully scraped using Supadata API")
+                else:
+                    st.error(f"Supadata API failed with status {response.status_code}")
+            except Exception as e:
+                st.error(f"Error calling Supadata API: {e}")
+
+        # **Step 3: If Supadata API Fails, Use Direct Web Request**
+        if not page_content:
+            try:
+                r = requests.get(url, timeout=10)
+                if r.status_code == 200:
+                    page_content = r.text
+                    st.success("Successfully retrieved content with direct request")
+                else:
+                    st.error(f"Manual scraping failed with status code {r.status_code}")
+                    return {}
+            except Exception as e:
+                st.error(f"Manual scraping error: {e}")
                return {}
-
-
-
+
+    # If we still don't have content after all attempts
+    if not page_content:
+        st.error("Failed to retrieve content from the URL with all available methods")
+        return {}
 
     # **Pass Content to Gemini AI**
     full_prompt = (
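
A note on the crawl4ai branch above: Streamlit runs the script on a plain synchronous thread, so the commit drives the coroutine with a manually created event loop (asyncio.new_event_loop / run_until_complete / close). An equivalent, slightly tidier pattern would be a small helper built on asyncio.run(); run_async below is hypothetical, not part of the commit, and assumes no event loop is already running in the calling thread:

    import asyncio

    def run_async(coro):
        # asyncio.run() creates a fresh event loop, runs the coroutine to
        # completion, and closes the loop afterwards, replacing the manual
        # new_event_loop / set_event_loop / run_until_complete / close sequence.
        # Assumes no event loop is already running in this thread.
        return asyncio.run(coro)

    # Hypothetical usage inside the crawl4ai branch:
    # page_content = run_async(run_crawler())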
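One caveat on the playwright branch above: in recent langchain_community releases, PlaywrightURLLoader.aload() is a coroutine, so data = loader.aload() without an await would leave data holding a coroutine object rather than a list of documents, and data[0].page_content would fail. Whether that applies depends on the version pinned for this Space; a sketch of how that branch would read under that assumption:

    # If aload() is a coroutine on the installed version, drive it explicitly:
    data = asyncio.run(loader.aload())
    # ...or fall back to the synchronous API instead:
    # data = loader.load()
    page_content = data[0].page_content if data else ""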
@@ -191,8 +238,6 @@ def get_data_from_url(url):
 
 
 
-
-
 def process_multiple_search_terms(search_terms):
     """
     Process multiple search terms with progress tracking.
@@ -310,6 +355,13 @@ def main():
         "Enter URL to scrape for grant opportunities",
         placeholder="https://example.com/grants"
     )
+
+    # Scraping tool selector
+    scraping_tool = st.sidebar.radio(
+        "Select Scraping Tool:",
+        ("Supadata", "Crawl4AI", "Playwright"),
+        key="scraping_tool_selector"
+    )
 
     # Execute based on input type selection
     if input_type == "Search Query":
@@ -329,8 +381,8 @@
     else: # URL input
         if st.sidebar.button("🔍 Scrape URL for Grant Opportunities"):
            if url_input:
-                with st.spinner("Scraping URL... Please wait patiently."):
-                    result = get_data_from_url(url_input)
+                with st.spinner(f"Scraping URL using {scraping_tool}... Please wait patiently."):
+                    result = get_data_from_url(url_input, scraping_tool.lower())
                st.session_state.scraped_data = result
                if result.get("grants"):
                    st.sidebar.success(f"✅ Found {len(result['grants'])} grant opportunities from the URL!")
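
The wiring above works because the radio labels ("Supadata", "Crawl4AI", "Playwright") lowercase exactly to the keys get_data_from_url() checks. If the display labels ever diverge from the internal keys, an explicit mapping keeps them decoupled; TOOL_KEYS below is hypothetical, not part of the commit:

    # Hypothetical explicit label-to-key mapping, equivalent to .lower() today.
    TOOL_KEYS = {"Supadata": "supadata", "Crawl4AI": "crawl4ai", "Playwright": "playwright"}
    result = get_data_from_url(url_input, TOOL_KEYS[scraping_tool])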
@@ -408,4 +460,4 @@ def main():
     )
 
 if __name__ == "__main__":
-    main()
+    main()