Craw4ai-example

Sleeping

App Files Community

rairo committed on Feb 16, 2025

Commit

a1b4ab9

verified ·

1 Parent(s): 21f4f5f

Update app.py

Browse files

Files changed (1) hide show

app.py +157 -149

app.py CHANGED Viewed

@@ -21,172 +21,180 @@ nest_asyncio.apply()
 GOOGLE_API_KEY = os.environ['GOOGLE_API_KEY']
 graph_config = {
-    "llm": {
-        "api_key": GOOGLE_API_KEY,
-        "model": "google_genai/gemini-pro",
-    },
 }
 def get_data(url):
-    smart_scraper_graph = SmartScraperGraph(
-        prompt="List me all grants or funds,short summary of grant description,the organisations funding them, The value of the grant as an integer, the due date, eligible countries, sector and eligibility criteria for applicants.",
-        source=url,
-        config=graph_config
-    )
-    return smart_scraper_graph.run()
 def process_multiple_urls(urls):
-    """
-    Process multiple URLs with progress tracking
-    """
-    all_data = {"grants": []}
-    progress_bar = st.progress(0)
-    status_container = st.empty()
-    total_urls = len(urls)
-    for index, url in enumerate(urls):
-        try:
-            url = url.strip()
-            if not url:
-                continue
-            # Update progress
-            progress = (index + 1) / total_urls
-            progress_bar.progress(progress)
-            # Show current status
-            status_container.markdown(f"""
-            **Processing URL {index+1} of {total_urls}**
-            🔍 Scanning: `{url}`
-            ✅ Completed: {index}/{total_urls}
-            ⏳ Remaining: {total_urls - index - 1}
-            """)
-            # Scrape data
-            result = get_data(url)
-            if result and 'grants' in result:
-                all_data['grants'].extend(result['grants'])
-        except Exception as e:
-            st.error(f"Error processing {url}: {str(e)}")
-            continue
-    progress_bar.empty()
-    status_container.empty()
-    return all_data
 def convert_to_csv(data):
-    df = pd.DataFrame(data['grants'])
-    return df.to_csv(index=False).encode('utf-8')
 def convert_to_excel(data):
-    df = pd.DataFrame(data['grants'])
-    buffer = io.BytesIO()
-    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
-        df.to_excel(writer, sheet_name='Grants', index=False)
-    return buffer.getvalue()
 def create_knowledge_base(data):
-    documents = []
-    for grant in data['grants']:
-        doc_parts = [f"{key.replace('_', ' ').title()}: {value}" for key, value in grant.items()]
-        documents.append("\n".join(doc_parts))
-    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-    texts = text_splitter.create_documents(documents)
-    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
-    vectorstore = FAISS.from_documents(texts, embeddings)
-    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-thinking-exp", google_api_key=GOOGLE_API_KEY, temperature=0)
-    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
-    return ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), memory=memory)
 def get_shareable_link(file_data, file_name, file_type):
-    b64 = base64.b64encode(file_data).decode()
-    return f"data:{file_type};base64,{b64}"
 def main():
-    st.sidebar.title("Quantilytix Grant Scraper")
-    st.sidebar.image("logoqb.jpeg", use_container_width=True)
-    # Multi-url input
-    url_input = st.sidebar.text_area(
-        "Enter URLs (one per line)",
-        height=150,
-        help="Enter multiple URLs separated by new lines"
-    )
-    if "scraped_data" not in st.session_state:
-        st.session_state.scraped_data = None
-    if st.sidebar.button("Get grants"):
-        if url_input:
-            urls = [url.strip() for url in url_input.split('\n') if url.strip()]
-            if urls:
-                try:
-                    with st.spinner("Starting scraping process..."):
-                        result = process_multiple_urls(urls)
-                        st.session_state.scraped_data = result
-                        st.success(f"Scraped {len(result['grants'])} grants from {len(urls)} URLs!")
-                except Exception as e:
-                    st.error(f"Error in scraping process: {e}")
-            else:
-                st.warning("Please enter valid URLs.")
-        else:
-            st.warning("Please enter at least one URL.")
-    if st.session_state.scraped_data:
-        selected_format = st.sidebar.selectbox("Select Download Format", ("CSV", "Excel"))
-        result = st.session_state.scraped_data
-        if selected_format == "CSV":
-            file_data = convert_to_csv(result)
-            file_name = "grants.csv"
-            file_type = "text/csv"
-        else:
-            file_data = convert_to_excel(result)
-            file_name = "grants.xlsx"
-            file_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-        b64 = base64.b64encode(file_data).decode()
-        download_link = f"<a href='data:{file_type};base64,{b64}' download='{file_name}'>Download {selected_format}</a>"
-        st.sidebar.markdown(download_link, unsafe_allow_html=True)
-        shareable_link = get_shareable_link(file_data, file_name, file_type)
-        st.sidebar.markdown("---")
-        st.sidebar.markdown("**Share Options:**")
-        whatsapp_url = f"https://api.whatsapp.com/send?text={urllib.parse.quote(f'Check out this file: {shareable_link}')}"
-        st.sidebar.markdown(f"📱 [Share via WhatsApp]({whatsapp_url})")
-        email_subject = urllib.parse.quote("Check out this grants file")
-        email_body = urllib.parse.quote(f"Download the file here: {shareable_link}")
-        email_url = f"mailto:?subject={email_subject}&body={email_body}"
-        st.sidebar.markdown(f"📧 [Share via Email]({email_url})")
-        with st.expander(f"Preview of data ({len(result['grants'])} grants)"):
-            st.dataframe(result['grants'])
-        if st.sidebar.button("Load data as Knowledge Base"):
-            st.session_state.qa_chain = create_knowledge_base(result)
-            st.session_state.chat_interface_active = True
-    if "chat_interface_active" in st.session_state and st.session_state.chat_interface_active:
-        st.header("Chat Interface Loaded. Start asking questions about the grants!")
-        st.image("logoqb.jpeg", width=150)
-        query = st.text_input("Ask a question about the grants:", key="chat_input")
-        if query:
-            if st.session_state.qa_chain:
-                response = st.session_state.qa_chain({"question": query})
-                st.session_state.chat_history.append({"query": query, "response": response['answer']})
-            else:
-                st.error("Knowledge base not loaded. Please load the knowledge base first.")
-        for chat in st.session_state.chat_history:
-            st.markdown(f"<p style='color: #8C92AC;'><strong>You:</strong> {chat['query']}</p>", unsafe_allow_html=True)
-            st.markdown(f"<p style='color: #6699CC;'><strong>Grants Bot:</strong> {chat['response']}</p>", unsafe_allow_html=True)
 if __name__ == "__main__":
-    main()

 GOOGLE_API_KEY = os.environ['GOOGLE_API_KEY']
 graph_config = {
+    "llm": {
+        "api_key": GOOGLE_API_KEY,
+        "model": "google_genai/gemini-pro",
+    },
 }
 def get_data(url):
+    smart_scraper_graph = SmartScraperGraph(
+        prompt="List me all grants or funds,short summary of grant description,the organisations funding them, The value of the grant as an integer, the due date, eligible countries, sector and eligibility criteria for applicants.",
+        source=url,
+        config=graph_config
+    )
+    return smart_scraper_graph.run()
 def process_multiple_urls(urls):
+    """
+    Process multiple URLs with progress tracking
+    """
+    all_data = {"grants": []}
+    progress_bar = st.progress(0)
+    status_container = st.empty()
+    total_urls = len(urls)
+    for index, url in enumerate(urls):
+        try:
+            url = url.strip()
+            if not url:
+                continue
+            # Update progress
+            progress = (index + 1) / total_urls
+            progress_bar.progress(progress)
+            # Show current status
+            status_container.markdown(f"""
+            **Processing URL {index+1} of {total_urls}**
+            🔍 Scanning: `{url}`
+            ✅ Completed: {index}/{total_urls}
+            ⏳ Remaining: {total_urls - index - 1}
+            """)
+            # Scrape data
+            result = get_data(url)
+            if result and 'grants' in result:
+                all_data['grants'].extend(result['grants'])
+        except Exception as e:
+            st.error(f"Error processing {url}: {str(e)}")
+            continue
+    progress_bar.empty()
+    status_container.empty()
+    return all_data
 def convert_to_csv(data):
+    df = pd.DataFrame(data['grants'])
+    return df.to_csv(index=False).encode('utf-8')
 def convert_to_excel(data):
+    df = pd.DataFrame(data['grants'])
+    buffer = io.BytesIO()
+    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
+        df.to_excel(writer, sheet_name='Grants', index=False)
+    return buffer.getvalue()
 def create_knowledge_base(data):
+    documents = []
+    for grant in data['grants']:
+        doc_parts = [f"{key.replace('_', ' ').title()}: {value}" for key, value in grant.items()]
+        documents.append("\n".join(doc_parts))
+    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    texts = text_splitter.create_documents(documents)
+    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
+    vectorstore = FAISS.from_documents(texts, embeddings)
+    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-thinking-exp", google_api_key=GOOGLE_API_KEY, temperature=0)
+    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+    return ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), memory=memory)
 def get_shareable_link(file_data, file_name, file_type):
+    b64 = base64.b64encode(file_data).decode()
+    return f"data:{file_type};base64,{b64}"
 def main():
+    st.sidebar.title("Quantilytix Grant Scraper")
+    st.sidebar.image("logoqb.jpeg", use_container_width=True)
+    # Initialize session state for scraped data and chat history if not already present
+    if "scraped_data" not in st.session_state:
+        st.session_state.scraped_data = None
+    if "chat_history" not in st.session_state:
+        st.session_state.chat_history = []
+    if "chat_interface_active" not in st.session_state:
+        st.session_state.chat_interface_active = False
+    # Multi-url input
+    url_input = st.sidebar.text_area(
+        "Enter URLs (one per line)",
+        height=150,
+        help="Enter multiple URLs separated by new lines"
+    )
+    if st.sidebar.button("Get grants"):
+        if url_input:
+            urls = [url.strip() for url in url_input.split('\n') if url.strip()]
+            if urls:
+                try:
+                    with st.spinner("Starting scraping process..."):
+                        result = process_multiple_urls(urls)
+                        st.session_state.scraped_data = result
+                        st.success(f"Scraped {len(result['grants'])} grants from {len(urls)} URLs!")
+                except Exception as e:
+                    st.error(f"Error in scraping process: {e}")
+            else:
+                st.warning("Please enter valid URLs.")
+        else:
+            st.warning("Please enter at least one URL.")
+    if st.session_state.scraped_data:
+        selected_format = st.sidebar.selectbox("Select Download Format", ("CSV", "Excel"))
+        result = st.session_state.scraped_data
+        if selected_format == "CSV":
+            file_data = convert_to_csv(result)
+            file_name = "grants.csv"
+            file_type = "text/csv"
+        else:
+            file_data = convert_to_excel(result)
+            file_name = "grants.xlsx"
+            file_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+        b64 = base64.b64encode(file_data).decode()
+        download_link = f"<a href='data:{file_type};base64,{b64}' download='{file_name}'>Download {selected_format}</a>"
+        st.sidebar.markdown(download_link, unsafe_allow_html=True)
+        shareable_link = get_shareable_link(file_data, file_name, file_type)
+        st.sidebar.markdown("---")
+        st.sidebar.markdown("**Share Options:**")
+        whatsapp_url = f"https://api.whatsapp.com/send?text={urllib.parse.quote(f'Check out this file: {shareable_link}')}"
+        st.sidebar.markdown(f"📱 [Share via WhatsApp]({whatsapp_url})")
+        email_subject = urllib.parse.quote("Check out this grants file")
+        email_body = urllib.parse.quote(f"Download the file here: {shareable_link}")
+        email_url = f"mailto:?subject={email_subject}&body={email_body}"
+        st.sidebar.markdown(f"📧 [Share via Email]({email_url})")
+        with st.expander(f"Preview of data ({len(result['grants'])} grants)"):
+            st.dataframe(result['grants'])
+        if st.sidebar.button("Load data as Knowledge Base"):
+            st.session_state.qa_chain = create_knowledge_base(result)
+            st.session_state.chat_interface_active = True
+            st.session_state.chat_history = [] # Initialize chat_history here when KB is loaded
+    if "chat_interface_active" in st.session_state and st.session_state.chat_interface_active:
+        st.header("Chat Interface Loaded. Start asking questions about the grants!")
+        st.image("logoqb.jpeg", width=150)
+        query = st.text_input("Ask a question about the grants:", key="chat_input")
+        if query:
+            if st.session_state.qa_chain:
+                response = st.session_state.qa_chain({"question": query})
+                st.session_state.chat_history.append({"query": query, "response": response['answer']})
+            else:
+                st.error("Knowledge base not loaded. Please load the knowledge base first.")
+        if "chat_history" in st.session_state: # Check if chat_history exists before iterating
+            for chat in st.session_state.chat_history:
+                st.markdown(f"<p style='color: #8C92AC;'><strong>You:</strong> {chat['query']}</p>", unsafe_allow_html=True)
+                st.markdown(f"<p style='color: #6699CC;'><strong>Grants Bot:</strong> {chat['response']}</p>", unsafe_allow_html=True)
 if __name__ == "__main__":
+    main()