Spaces:

rairo
/

QuantGrantsList

Sleeping

App Files Community

rairo committed on Feb 17, 2025

Commit

6032808

verified ·

1 Parent(s): a1b4ab9

Update app.py

Browse files

Files changed (1) hide show

app.py +177 -158

app.py CHANGED Viewed

@@ -18,183 +18,202 @@ import urllib.parse
 subprocess.run(["playwright", "install"])
 nest_asyncio.apply()
-GOOGLE_API_KEY = os.environ['GOOGLE_API_KEY']
 graph_config = {
-    "llm": {
-        "api_key": GOOGLE_API_KEY,
-        "model": "google_genai/gemini-pro",
-    },
 }
 def get_data(url):
-    smart_scraper_graph = SmartScraperGraph(
-        prompt="List me all grants or funds,short summary of grant description,the organisations funding them, The value of the grant as an integer, the due date, eligible countries, sector and eligibility criteria for applicants.",
-        source=url,
-        config=graph_config
-    )
-    return smart_scraper_graph.run()
 def process_multiple_urls(urls):
-    """
-    Process multiple URLs with progress tracking
-    """
-    all_data = {"grants": []}
-    progress_bar = st.progress(0)
-    status_container = st.empty()
-    total_urls = len(urls)
-    for index, url in enumerate(urls):
-        try:
-            url = url.strip()
-            if not url:
-                continue
-            # Update progress
-            progress = (index + 1) / total_urls
-            progress_bar.progress(progress)
-            # Show current status
-            status_container.markdown(f"""
-            **Processing URL {index+1} of {total_urls}**
-            🔍 Scanning: `{url}`
-            ✅ Completed: {index}/{total_urls}
-            ⏳ Remaining: {total_urls - index - 1}
-            """)
-            # Scrape data
-            result = get_data(url)
-            if result and 'grants' in result:
-                all_data['grants'].extend(result['grants'])
-        except Exception as e:
-            st.error(f"Error processing {url}: {str(e)}")
-            continue
-    progress_bar.empty()
-    status_container.empty()
-    return all_data
 def convert_to_csv(data):
-    df = pd.DataFrame(data['grants'])
-    return df.to_csv(index=False).encode('utf-8')
 def convert_to_excel(data):
-    df = pd.DataFrame(data['grants'])
-    buffer = io.BytesIO()
-    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
-        df.to_excel(writer, sheet_name='Grants', index=False)
-    return buffer.getvalue()
 def create_knowledge_base(data):
-    documents = []
-    for grant in data['grants']:
-        doc_parts = [f"{key.replace('_', ' ').title()}: {value}" for key, value in grant.items()]
-        documents.append("\n".join(doc_parts))
-    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-    texts = text_splitter.create_documents(documents)
-    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
-    vectorstore = FAISS.from_documents(texts, embeddings)
-    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-thinking-exp", google_api_key=GOOGLE_API_KEY, temperature=0)
-    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
-    return ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), memory=memory)
 def get_shareable_link(file_data, file_name, file_type):
-    b64 = base64.b64encode(file_data).decode()
-    return f"data:{file_type};base64,{b64}"
 def main():
-    st.sidebar.title("Quantilytix Grant Scraper")
-    st.sidebar.image("logoqb.jpeg", use_container_width=True)
-    # Initialize session state for scraped data and chat history if not already present
-    if "scraped_data" not in st.session_state:
-        st.session_state.scraped_data = None
-    if "chat_history" not in st.session_state:
-        st.session_state.chat_history = []
-    if "chat_interface_active" not in st.session_state:
-        st.session_state.chat_interface_active = False
-    # Multi-url input
-    url_input = st.sidebar.text_area(
-        "Enter URLs (one per line)",
-        height=150,
-        help="Enter multiple URLs separated by new lines"
-    )
-    if st.sidebar.button("Get grants"):
-        if url_input:
-            urls = [url.strip() for url in url_input.split('\n') if url.strip()]
-            if urls:
-                try:
-                    with st.spinner("Starting scraping process..."):
-                        result = process_multiple_urls(urls)
-                        st.session_state.scraped_data = result
-                        st.success(f"Scraped {len(result['grants'])} grants from {len(urls)} URLs!")
-                except Exception as e:
-                    st.error(f"Error in scraping process: {e}")
-            else:
-                st.warning("Please enter valid URLs.")
-        else:
-            st.warning("Please enter at least one URL.")
-    if st.session_state.scraped_data:
-        selected_format = st.sidebar.selectbox("Select Download Format", ("CSV", "Excel"))
-        result = st.session_state.scraped_data
-        if selected_format == "CSV":
-            file_data = convert_to_csv(result)
-            file_name = "grants.csv"
-            file_type = "text/csv"
-        else:
-            file_data = convert_to_excel(result)
-            file_name = "grants.xlsx"
-            file_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-        b64 = base64.b64encode(file_data).decode()
-        download_link = f"<a href='data:{file_type};base64,{b64}' download='{file_name}'>Download {selected_format}</a>"
-        st.sidebar.markdown(download_link, unsafe_allow_html=True)
-        shareable_link = get_shareable_link(file_data, file_name, file_type)
-        st.sidebar.markdown("---")
-        st.sidebar.markdown("**Share Options:**")
-        whatsapp_url = f"https://api.whatsapp.com/send?text={urllib.parse.quote(f'Check out this file: {shareable_link}')}"
-        st.sidebar.markdown(f"📱 [Share via WhatsApp]({whatsapp_url})")
-        email_subject = urllib.parse.quote("Check out this grants file")
-        email_body = urllib.parse.quote(f"Download the file here: {shareable_link}")
-        email_url = f"mailto:?subject={email_subject}&body={email_body}"
-        st.sidebar.markdown(f"📧 [Share via Email]({email_url})")
-        with st.expander(f"Preview of data ({len(result['grants'])} grants)"):
-            st.dataframe(result['grants'])
-        if st.sidebar.button("Load data as Knowledge Base"):
-            st.session_state.qa_chain = create_knowledge_base(result)
-            st.session_state.chat_interface_active = True
-            st.session_state.chat_history = [] # Initialize chat_history here when KB is loaded
-    if "chat_interface_active" in st.session_state and st.session_state.chat_interface_active:
-        st.header("Chat Interface Loaded. Start asking questions about the grants!")
-        st.image("logoqb.jpeg", width=150)
-        query = st.text_input("Ask a question about the grants:", key="chat_input")
-        if query:
-            if st.session_state.qa_chain:
-                response = st.session_state.qa_chain({"question": query})
-                st.session_state.chat_history.append({"query": query, "response": response['answer']})
-            else:
-                st.error("Knowledge base not loaded. Please load the knowledge base first.")
-        if "chat_history" in st.session_state: # Check if chat_history exists before iterating
-            for chat in st.session_state.chat_history:
-                st.markdown(f"<p style='color: #8C92AC;'><strong>You:</strong> {chat['query']}</p>", unsafe_allow_html=True)
-                st.markdown(f"<p style='color: #6699CC;'><strong>Grants Bot:</strong> {chat['response']}</p>", unsafe_allow_html=True)
 if __name__ == "__main__":
-    main()

 subprocess.run(["playwright", "install"])
 nest_asyncio.apply()
+GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]
 graph_config = {
+    "llm": {
+        "api_key": GOOGLE_API_KEY,
+        "model": "google_genai/gemini-pro",
+    },
 }
 def get_data(url):
+    smart_scraper_graph = SmartScraperGraph(
+        prompt=(
+            "List me all grants or funds, short summary of grant description, "
+            "the organisations funding them, the value of the grant as an integer, "
+            "the due date, eligible countries, sector and eligibility criteria for applicants."
+        ),
+        source=url,
+        config=graph_config,
+    )
+    return smart_scraper_graph.run()
 def process_multiple_urls(urls):
+    """
+    Process multiple URLs with progress tracking
+    """
+    all_data = {"grants": []}
+    progress_bar = st.progress(0)
+    status_container = st.empty()
+    total_urls = len(urls)
+    for index, url in enumerate(urls):
+        try:
+            url = url.strip()
+            if not url:
+                continue
+            # Update progress
+            progress = (index + 1) / total_urls
+            progress_bar.progress(progress)
+            # Show current status
+            status_container.markdown(
+                f"""
+**Processing URL {index+1} of {total_urls}**
+🔍 Scanning: `{url}`
+✅ Completed: {index}/{total_urls}
+⏳ Remaining: {total_urls - index - 1}
+"""
+            )
+            # Scrape data
+            result = get_data(url)
+            if result and "grants" in result:
+                all_data["grants"].extend(result["grants"])
+        except Exception as e:
+            st.error(f"Error processing {url}: {str(e)}")
+            continue
+    progress_bar.empty()
+    status_container.empty()
+    return all_data
 def convert_to_csv(data):
+    df = pd.DataFrame(data["grants"])
+    return df.to_csv(index=False).encode("utf-8")
 def convert_to_excel(data):
+    df = pd.DataFrame(data["grants"])
+    buffer = io.BytesIO()
+    with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
+        df.to_excel(writer, sheet_name="Grants", index=False)
+    return buffer.getvalue()
 def create_knowledge_base(data):
+    documents = []
+    for grant in data["grants"]:
+        doc_parts = [f"{key.replace('_', ' ').title()}: {value}" for key, value in grant.items()]
+        documents.append("\n".join(doc_parts))
+    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    texts = text_splitter.create_documents(documents)
+    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
+    vectorstore = FAISS.from_documents(texts, embeddings)
+    llm = ChatGoogleGenerativeAI(
+        model="gemini-2.0-flash-thinking-exp", google_api_key=GOOGLE_API_KEY, temperature=0
+    )
+    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+    return ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), memory=memory)
 def get_shareable_link(file_data, file_name, file_type):
+    b64 = base64.b64encode(file_data).decode()
+    return f"data:{file_type};base64,{b64}"
 def main():
+    st.sidebar.title("Quantilytix Grant Scraper")
+    st.sidebar.image("logoqb.jpeg", use_container_width=True)
+    # Initialize session state for scraped data and chat history if not already present
+    if "scraped_data" not in st.session_state:
+        st.session_state.scraped_data = None
+    if "chat_history" not in st.session_state:
+        st.session_state.chat_history = []
+    if "chat_interface_active" not in st.session_state:
+        st.session_state.chat_interface_active = False
+    # Multi-URL input
+    url_input = st.sidebar.text_area(
+        "Enter URLs (one per line)",
+        height=150,
+        help="Enter multiple URLs separated by new lines",
+    )
+    if st.sidebar.button("Get grants"):
+        if url_input:
+            urls = [url.strip() for url in url_input.split("\n") if url.strip()]
+            if urls:
+                try:
+                    with st.spinner("Starting scraping process..."):
+                        result = process_multiple_urls(urls)
+                        st.session_state.scraped_data = result
+                        st.success(f"Scraped {len(result['grants'])} grants from {len(urls)} URLs!")
+                except Exception as e:
+                    st.error(f"Error in scraping process: {e}")
+            else:
+                st.warning("Please enter valid URLs.")
+        else:
+            st.warning("Please enter at least one URL.")
+    if st.session_state.scraped_data:
+        selected_format = st.sidebar.selectbox("Select Download Format", ("CSV", "Excel"))
+        result = st.session_state.scraped_data
+        if selected_format == "CSV":
+            file_data = convert_to_csv(result)
+            file_name = "grants.csv"
+            file_type = "text/csv"
+        else:
+            file_data = convert_to_excel(result)
+            file_name = "grants.xlsx"
+            file_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+        b64 = base64.b64encode(file_data).decode()
+        download_link = f"<a href='data:{file_type};base64,{b64}' download='{file_name}'>Download {selected_format}</a>"
+        st.sidebar.markdown(download_link, unsafe_allow_html=True)
+        shareable_link = get_shareable_link(file_data, file_name, file_type)
+        st.sidebar.markdown("---")
+        st.sidebar.markdown("**Share Options:**")
+        whatsapp_url = f"https://api.whatsapp.com/send?text={urllib.parse.quote(f'Check out this file: {shareable_link}')}"
+        st.sidebar.markdown(f"📱 [Share via WhatsApp]({whatsapp_url})")
+        email_subject = urllib.parse.quote("Check out this grants file")
+        email_body = urllib.parse.quote(f"Download the file here: {shareable_link}")
+        email_url = f"mailto:?subject={email_subject}&body={email_body}"
+        st.sidebar.markdown(f"📧 [Share via Email]({email_url})")
+        with st.expander(f"Preview of data ({len(result['grants'])} grants)"):
+            st.dataframe(result["grants"])
+        if st.sidebar.button("Load data as Knowledge Base"):
+            st.session_state.qa_chain = create_knowledge_base(result)
+            st.session_state.chat_interface_active = True
+            st.session_state.chat_history = []  # Initialize chat_history when KB is loaded
+    if st.session_state.get("chat_interface_active"):
+        st.header("Chat Interface Loaded. Start asking questions about the grants!")
+        st.image("logoqb.jpeg", width=150)
+        query = st.text_input("Ask a question about the grants:", key="chat_input")
+        if query:
+            if st.session_state.qa_chain:
+                response = st.session_state.qa_chain({"question": query})
+                st.session_state.chat_history.append({"query": query, "response": response["answer"]})
+            else:
+                st.error("Knowledge base not loaded. Please load the knowledge base first.")
+        for chat in st.session_state.get("chat_history", []):
+            st.markdown(
+                f"<p style='color: #8C92AC;'><strong>You:</strong> {chat['query']}</p>",
+                unsafe_allow_html=True,
+            )
+            st.markdown(
+                f"<p style='color: #6699CC;'><strong>Grants Bot:</strong> {chat['response']}</p>",
+                unsafe_allow_html=True,
+            )
 if __name__ == "__main__":
+    main()