Update app.py
app.py CHANGED
@@ -1,463 +1,59 @@
 import streamlit as st
-import pandas as pd
-import base64
-import json
-from scrapegraphai.graphs import SearchGraph
-import nest_asyncio
-import os
-import subprocess
-import io
-import time
-import urllib.parse
 import asyncio
-from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
-from langchain.vectorstores import FAISS
-from langchain.text_splitter import CharacterTextSplitter
-from langchain.chains import ConversationalRetrievalChain
-from langchain.memory import ConversationBufferMemory
-from google import genai
-from google.genai import types
-from langchain_community.document_loaders import PlaywrightURLLoader
-import requests
-# Import Supadata and initialize the client
-from supadata import Supadata, SupadataError
-# Import Crawl4AI
 from crawl4ai import AsyncWebCrawler
 
-
-supadata = Supadata(api_key=SUPADATA_API_KEY)
+st.set_page_config(page_title="Web Crawler App", layout="wide")
 
-
-subprocess.run(["playwright", "install"])
-nest_asyncio.apply()
+st.title("Web Crawler App")
 
-
+# Input for URL
+url = st.text_input("Enter URL to crawl:", value="https://www.nbcnews.com/business")
 
-graph_config = {
-    "max_results": 8,
-    "verbose": True,
-    "headless": True
-}
+# Optional parameters
+with st.expander("Advanced Options"):
+    max_depth = st.slider("Max Crawl Depth", min_value=1, max_value=5, value=1)
+    timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=30)
+    max_pages = st.number_input("Max Pages to Crawl", min_value=1, max_value=100, value=10)
 
-
-def get_data(search_term):
-    full_prompt = (
-        f"search for {search_term} grants\n\n"
-        "List me all grants or funds with:\n"
-        "- Grant name/title\n"
-        "- Short summary \n"
-        "- Funding organization\n"
-        "- Grant value (numeric only)\n"
-        "- Application deadline\n"
-        "- Eligible countries\n"
-        "- Sector/field\n"
-        "- Eligibility criteria\n"
-        "Return in JSON format."
-    )
-    try:
-        search_graph = SearchGraph(
-            prompt=full_prompt,
-            config=graph_config,
+# Function to run the crawler
+async def run_crawler(url, max_depth=1, timeout=30, max_pages=10):
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url=url,
+            max_depth=max_depth,
+            timeout=timeout,
+            max_pages=max_pages
         )
-        result = search_graph.run()
-        return result
-    except Exception as e:
-        if "202" in str(e):
-            st.warning("Rate limit reached (202). Waiting 10 seconds before retrying...")
-            time.sleep(10)
-            try:
-                search_graph = SearchGraph(
-                    prompt=full_prompt,
-                    config=graph_config,
-                )
-                result = search_graph.run()
-                if not result or not result.get("grants"):
-                    st.error(f"No results returned for {search_term}. Please try again with a different search term.")
-                    return {}
-                return result
-            except Exception as e2:
-                st.error(f"Retry failed: {e2}. Please try again later.")
-                return {}
-        else:
-            st.error(f"An error occurred for search term: {search_term}, error: {e}. Please try again.")
-            return {}
-
-
-SUPADATA_API_KEY = os.getenv("SUPADATA")
-
-def get_data_from_url(url, scraping_tool="supadata"):
-    """
-    Scrape the provided URL using the selected scraping tool.
-
-    Args:
-        url: The URL to scrape
-        scraping_tool: Either "supadata", "crawl4ai", or "playwright"
-
-    Returns:
-        Dictionary containing the extracted grant data
-    """
-    page_content = None  # Placeholder for storing scraped page content
-
-    # Choose the scraping method based on the selected tool
-    if scraping_tool == "crawl4ai":
+# Button to start crawling
+if st.button("Start Crawling"):
+    with st.spinner("Crawling in progress..."):
+        # We need to run the async function in a way that works with Streamlit
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
         try:
-            ...
-        except Exception:
-            st.warning("Falling back to Supadata scraper...")
-            scraping_tool = "supadata"
-
-    if scraping_tool == "playwright":
-        try:
-            loader = PlaywrightURLLoader(urls=[url], remove_selectors=["header", "footer"])
-            data = loader.aload()
-            page_content = data[0].page_content if data else ""
-            st.success("Successfully scraped using Playwright")
+            result = loop.run_until_complete(run_crawler(
+                url=url,
+                max_depth=max_depth,
+                timeout=timeout,
+                max_pages=max_pages
+            ))
+            # Display the results
+            st.subheader("Crawl Results")
+            st.markdown(result)
+            # Option to download results
+            st.download_button(
+                label="Download Results",
+                data=result,
+                file_name="crawl_results.md",
+                mime="text/markdown"
+            )
         except Exception as e:
-            st.error(f"Playwright scraping failed: {e}")
-            scraping_tool = "supadata"
-
-    if scraping_tool == "supadata":
-        # **Step 1: Attempt Supadata's Built-in Scraper**
-        try:
-            web_content = supadata.web.scrape(url)
-            page_content = web_content.content
-            st.success("Successfully scraped using Supadata built-in scraper")
-        except TypeError as te:
-            if "unexpected keyword argument 'type'" in str(te):
-                st.warning("Falling back to Supadata API due to unexpected keyword 'type' error.")
-            else:
-                st.error(f"Unexpected error in Supadata scrape: {te}")
-
-        # **Step 2: If Supadata's Built-in Scraper Fails, Use Supadata API**
-        if not page_content:
-            try:
-                api_url = "https://api.supadata.ai/v1/web/scrape"
-                headers = {"X-API-Key": SUPADATA_API_KEY}
-                response = requests.get(api_url, headers=headers, params={"url": url})
-
-                if response.status_code == 200:
-                    page_content = response.json().get("content", "")
-                    st.success("Successfully scraped using Supadata API")
-                else:
-                    st.error(f"Supadata API failed with status {response.status_code}")
-            except Exception as e:
-                st.error(f"Error calling Supadata API: {e}")
-
-        # **Step 3: If Supadata API Fails, Use Direct Web Request**
-        if not page_content:
-            try:
-                r = requests.get(url, timeout=10)
-                if r.status_code == 200:
-                    page_content = r.text
-                    st.success("Successfully retrieved content with direct request")
-                else:
-                    st.error(f"Manual scraping failed with status code {r.status_code}")
-                    return {}
-            except Exception as e:
-                st.error(f"Manual scraping error: {e}")
-                return {}
-
-    # If we still don't have content after all attempts
-    if not page_content:
-        st.error("Failed to retrieve content from the URL with all available methods")
-        return {}
-
-    # **Pass Content to Gemini AI**
-    full_prompt = (
-        "Extract the following grant data from the provided web content. "
-        "- Grant name/title\n"
-        "- Short summary\n"
-        "- Funding organization\n"
-        "- Grant value (numeric only)\n"
-        "- Application deadline\n"
-        "- Eligible countries\n"
-        "- Sector/field\n"
-        "- Eligibility criteria\n"
-        "Return in JSON format.\n\n"
-        f"Web content: {page_content}"
-    )
-
-    client = genai.Client(api_key=GOOGLE_API_KEY)
-    new_answer = client.models.generate_content(
-        model="models/gemini-2.0-flash-lite",
-        contents=f"{full_prompt}, return the json string and nothing else"
-    )
-
-    response = new_answer.text
-
-    # **Extract JSON Output from Gemini**
-    try:
-        start_index = response.find('[')
-        end_index = response.rfind(']') + 1
-        json_string = response[start_index:end_index]
-        result = json.loads(json_string)
-    except Exception as parse_error:
-        st.error(f"Error parsing JSON from Gemini model response. Response: {response}")
-        return {}
-
-    # **Ensure JSON is Wrapped Correctly**
-    if isinstance(result, list):
-        result = {"grants": result}
-
-    if not result.get("grants"):
-        st.error("No grant opportunities found in the scraped URL.")
-        return {}
-
-    st.success(f"First grant opportunity: {result['grants'][0]}")
-    return result
-
-
-def process_multiple_search_terms(search_terms):
-    """
-    Process multiple search terms with progress tracking.
-    Returns a dictionary with a 'grants' key containing combined results.
-    """
-    all_data = {"grants": []}
-    progress_bar = st.progress(0)
-    status_container = st.empty()
-    total_terms = len(search_terms)
-
-    for index, term in enumerate(search_terms):
-        term = term.strip()
-        if not term:
-            continue
-
-        progress = (index + 1) / total_terms
-        progress_bar.progress(progress)
-        status_container.markdown(
-            f"""
-            **Processing Grant Opportunities** 🚀
-            Searching term {index+1} of {total_terms}: `{term}`
-            <br>
-            <p style='font-size: 0.9em; color: #6699CC;'>Completed: {index}/{total_terms} | Remaining: {total_terms - index - 1}</p>
-            """,
-            unsafe_allow_html=True,
-        )
-
-        result = get_data(term)
-        if result and result.get("grants"):
-            all_data["grants"].extend(result["grants"])
-    progress_bar.empty()
-    status_container.empty()
-    if not all_data["grants"]:
-        st.error("No grant opportunities were found. Please try again with different search terms.")
-    return all_data
-
-def convert_to_csv(data):
-    df = pd.DataFrame(data["grants"])
-    return df.to_csv(index=False).encode("utf-8")
-
-def convert_to_excel(data):
-    df = pd.DataFrame(data["grants"])
-    buffer = io.BytesIO()
-    with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
-        df.to_excel(writer, sheet_name="Grants", index=False)
-    return buffer.getvalue()
-
-def create_knowledge_base(data):
-    # Store JSON representation of data in session state
-    st.session_state.knowledge_base_json = json.dumps(data, indent=2)
-
-def chat_with_knowledge_base(query):
-    if "knowledge_base_json" not in st.session_state:
-        return "Knowledge base not initialized. Please load grant data first."
-
-    context = st.session_state.knowledge_base_json
-    prompt = f"""
-    You are an AI assistant that helps users analyze grant opportunities.
-    Here is the extracted grant data in JSON format:
-
-    {context}
-
-    User's question: {query}
-    Answer the question based on the provided grant data.
-    """
-    llm = ChatGoogleGenerativeAI(
-        model="gemini-2.0-flash-thinking-exp", google_api_key=GOOGLE_API_KEY, temperature=0
-    )
-    response = llm.invoke(prompt)
-    return response.content
-
-def get_shareable_link(file_data, file_name, file_type):
-    b64 = base64.b64encode(file_data).decode()
-    return f"data:{file_type};base64,{b64}"
-
-def main():
-    st.set_page_config(page_title="Quantilytix Grant Finder", page_icon="💰", layout="wide")
-    st.title("💰 Quantilytix Grant Finder")
-    st.markdown("""
-    <div style="text-align: justify;">
-        <p>
-        Welcome to <b>Quantilytix Grant Finder</b>, an AI-powered platform designed to streamline the grant discovery process, especially for academics and researchers across the globe.
-        </p>
-    </div>
-    """, unsafe_allow_html=True)
-
-    # Sidebar controls
-    st.sidebar.image("logoqb.jpeg", use_container_width=True)
-    st.sidebar.header("Scrape & Configure")
-
-    if "scraped_data" not in st.session_state:
-        st.session_state.scraped_data = None
-    if "chat_history" not in st.session_state:
-        st.session_state.chat_history = []
-    if "chat_interface_active" not in st.session_state:
-        st.session_state.chat_interface_active = False
-
-    # Sidebar: Input Type Selection
-    input_type = st.sidebar.radio(
-        "Select Input Type:",
-        ("Search Query", "URL"),
-        key="input_type_selector"
-    )
-
-    # Sidebar: Input field based on selection
-    if input_type == "Search Query":
-        search_input = st.sidebar.text_area(
-            "Enter Search Terms (one per line). Maximum 2",
-            height=150,
-            help="Input search terms to discover grant opportunities. Terms can be specific or generic.",
-            placeholder="e.g.,\nRenewable energy \nclimate change research\nAgriculture in Africa"
-        )
-    else:
-        url_input = st.sidebar.text_input(
-            "Enter URL to scrape for grant opportunities",
-            placeholder="https://example.com/grants"
-        )
-
-    # Scraping tool selector
-    scraping_tool = st.sidebar.radio(
-        "Select Scraping Tool:",
-        ("Supadata", "Crawl4AI", "Playwright"),
-        key="scraping_tool_selector"
-    )
-
-    # Execute based on input type selection
-    if input_type == "Search Query":
-        if st.sidebar.button("🔍 Get Grant Opportunities"):
-            if search_input:
-                search_terms = [term.strip() for term in search_input.split("\n") if term.strip()]
-                if search_terms:
-                    with st.spinner("Searching in progress... Please wait patiently."):
-                        result = process_multiple_search_terms(search_terms)
-                        st.session_state.scraped_data = result
-                        if result.get("grants"):
-                            st.sidebar.success(f"✅ Found {len(result['grants'])} grant opportunities from {len(search_terms)} search terms!")
-                else:
-                    st.sidebar.warning("⚠️ Please enter valid search terms.")
-            else:
-                st.sidebar.warning("⚠️ Please enter at least one search term to begin.")
-    else:  # URL input
-        if st.sidebar.button("🔍 Scrape URL for Grant Opportunities"):
-            if url_input:
-                with st.spinner(f"Scraping URL using {scraping_tool}... Please wait patiently."):
-                    result = get_data_from_url(url_input, scraping_tool.lower())
-                    st.session_state.scraped_data = result
-                    if result.get("grants"):
-                        st.sidebar.success(f"✅ Found {len(result['grants'])} grant opportunities from the URL!")
-            else:
-                st.sidebar.warning("⚠️ Please enter a valid URL to scrape.")
-
-    # Sidebar: Download & Share Controls
-    if st.session_state.scraped_data and st.session_state.scraped_data.get('grants'):
-        st.sidebar.markdown("---")
-        st.sidebar.subheader("Download & Share")
-        selected_format = st.sidebar.selectbox("Download As:", ("CSV", "Excel"), key="download_format_selector")
-        if selected_format == "CSV":
-            file_data = convert_to_csv(st.session_state.scraped_data)
-            file_name = "grants_data.csv"
-            file_type = "text/csv"
-        else:
-            file_data = convert_to_excel(st.session_state.scraped_data)
-            file_name = "grants_data.xlsx"
-            file_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-        download_link_html = f"<a href='data:{file_type};base64,{base64.b64encode(file_data).decode()}' download='{file_name}'><button style='background-color:#4CAF50;color:white;padding:10px 15px;border:none;border-radius:4px;'>⬇️ Download {selected_format}</button></a>"
-        st.sidebar.markdown(download_link_html, unsafe_allow_html=True)
-        shareable_link = get_shareable_link(file_data, file_name, file_type)
-        whatsapp_url = f"https://api.whatsapp.com/send?text={urllib.parse.quote(f'Check out these grant opportunities: {shareable_link}')}"
-        email_subject = urllib.parse.quote("Grant Opportunities File")
-        email_body = urllib.parse.quote(f"Download the grant opportunities file here: {shareable_link}")
-        email_url = f"mailto:?subject={email_subject}&body={email_body}"
-        st.sidebar.markdown("<div style='margin-top:10px;'>Share via:</div>", unsafe_allow_html=True)
-        st.sidebar.markdown(f"📱 [WhatsApp]({whatsapp_url}) | 📧 [Email]({email_url})", unsafe_allow_html=True)
-
-        # Sidebar: Load as Knowledge Base & Chat
-        if st.sidebar.button("🧠 Load as Knowledge Base & Chat"):
-            with st.spinner("Loading data into knowledge base..."):
-                create_knowledge_base(st.session_state.scraped_data)
-                st.session_state.chat_interface_active = True
-                st.session_state.chat_history = []
-                st.sidebar.success("Knowledge base loaded!")
-
-    # Main area: Data Preview
-    st.markdown("---")
-    if st.session_state.scraped_data and st.session_state.scraped_data.get('grants'):
-        st.header("📊 Found Grant Data")
-        with st.expander(f"📊 Preview Grant Data ({len(st.session_state.scraped_data['grants'])} grants)"):
-            st.dataframe(st.session_state.scraped_data["grants"])
-
-    # Main area: Chat UI (shown if knowledge base is loaded)
-    if st.session_state.get("chat_interface_active"):
-        st.header("💬 Chat with Grants Bot")
-        query = st.text_input("Your question:", key="chat_input_main")
-        if query:
-            with st.spinner("Generating response..."):
-                response = chat_with_knowledge_base(query)
-                answer = response["answer"] if isinstance(response, dict) and "answer" in response else response
-                st.session_state.chat_history.append({"query": query, "response": answer})
-
-        if st.session_state.chat_history:
-            st.subheader("Chat History")
-            for chat in reversed(st.session_state.chat_history):
-                st.markdown(
-                    f"<div style='padding: 10px; border-radius: 5px; margin-bottom: 5px; background-color:#444444; color: white;'><strong>You:</strong> {chat['query']}</div>",
-                    unsafe_allow_html=True)
-                st.markdown(
-                    f"<div style='padding: 10px; border-radius: 5px; margin-bottom: 10px; background-color:#007BFF; color: white;'><strong>Grants Bot:</strong> {chat['response']}</div>",
-                    unsafe_allow_html=True)
-    else:
-        st.info("⬅️ Enter search terms or a URL in the sidebar and click the appropriate button to start searching.")
-
-    st.sidebar.markdown("---")
-    st.sidebar.markdown(
-        """
-        <div style='text-align: center; font-size: 0.8em; color: grey;'>
-        Powered by <a href="https://quantilytix.com" style='color: grey;'>Quantilytix</a> | © 2025
-        </div>
-        """,
-        unsafe_allow_html=True,
-    )
+            st.error(f"An error occurred: {str(e)}")
+        finally:
+            loop.close()
 
-
-
+# Add footer with information
+st.markdown("---")
+st.markdown("This app uses the crawl4ai library to extract content from web pages.")
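
A note on the event-loop handling in the new version: manually creating, setting, and closing a loop works, but when no loop is already running, asyncio.run() performs the same create/run/close sequence in one call. Below is a minimal sketch of that variant, assuming crawl4ai's documented AsyncWebCrawler context-manager usage; the function name fetch_markdown and the example URL are illustrative, and the sketch deliberately omits the max_depth/timeout/max_pages keyword arguments that the committed code passes straight through to crawler.arun(), since those may not be accepted by every crawl4ai release.

import asyncio

import streamlit as st
from crawl4ai import AsyncWebCrawler


async def fetch_markdown(url: str) -> str:
    # Same core pattern as the committed app: open the crawler as an
    # async context manager and return the crawled page as markdown.
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=url)
        return result.markdown


url = st.text_input("Enter URL to crawl:", value="https://example.com")  # placeholder URL
if st.button("Start Crawling"):
    with st.spinner("Crawling in progress..."):
        try:
            # asyncio.run() creates a fresh event loop, runs the coroutine,
            # and closes the loop again -- replacing the manual
            # new_event_loop() / run_until_complete() / close() sequence.
            st.markdown(asyncio.run(fetch_markdown(url)))
        except Exception as e:
            st.error(f"An error occurred: {e}")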