Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,6 +7,11 @@ import nest_asyncio
|
|
| 7 |
import os
|
| 8 |
import subprocess
|
| 9 |
import io
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
# Download the browser binaries Playwright needs (OS-level dependencies are NOT installed by this; that requires `playwright install --with-deps`)
|
| 12 |
subprocess.run(["playwright", "install"])
|
|
@@ -14,7 +19,6 @@ subprocess.run(["playwright", "install"])
|
|
| 14 |
|
| 15 |
nest_asyncio.apply()
|
| 16 |
|
| 17 |
-
|
| 18 |
GOOGLE_API_KEY = os.environ['GOOGLE_API_KEY']
|
| 19 |
|
| 20 |
graph_config = {
|
|
@@ -25,39 +29,68 @@ graph_config = {
|
|
| 25 |
}
|
| 26 |
|
| 27 |
def get_data(url):
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
return result
|
| 49 |
|
| 50 |
def convert_to_csv(data):
|
| 51 |
-
|
| 52 |
-
|
| 53 |
|
| 54 |
def convert_to_excel(data):
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
def main():
|
| 63 |
st.sidebar.title("Quantilytix Grant Scraper")
|
|
@@ -70,6 +103,9 @@ def main():
|
|
| 70 |
if "chat_history" not in st.session_state:
|
| 71 |
st.session_state.chat_history = []
|
| 72 |
|
|
|
|
|
|
|
|
|
|
| 73 |
if st.sidebar.button("Get grants"):
|
| 74 |
if url:
|
| 75 |
try:
|
|
@@ -101,6 +137,7 @@ def main():
|
|
| 101 |
st.dataframe(result['grants'])
|
| 102 |
|
| 103 |
if st.sidebar.button("Load as Knowledge Base"):
|
|
|
|
| 104 |
st.session_state.chat_interface_active = True
|
| 105 |
|
| 106 |
if "chat_interface_active" in st.session_state and st.session_state.chat_interface_active:
|
|
@@ -108,9 +145,11 @@ def main():
|
|
| 108 |
|
| 109 |
query = st.text_input("Ask a question about the grants:", key="chat_input")
|
| 110 |
if query:
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
|
|
|
|
|
|
| 114 |
|
| 115 |
# Display chat history
|
| 116 |
for chat in st.session_state.chat_history:
|
|
|
|
| 7 |
import os
|
| 8 |
import subprocess
|
| 9 |
import io
|
| 10 |
+
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
|
| 11 |
+
from langchain.vectorstores import FAISS
|
| 12 |
+
from langchain.text_splitter import CharacterTextSplitter
|
| 13 |
+
from langchain.chains import ConversationalRetrievalChain
|
| 14 |
+
from langchain.memory import ConversationBufferMemory
|
| 15 |
|
| 16 |
# Download the browser binaries Playwright needs (OS-level dependencies are NOT installed by this; that requires `playwright install --with-deps`)
|
| 17 |
subprocess.run(["playwright", "install"])
|
|
|
|
| 19 |
|
| 20 |
nest_asyncio.apply()
|
| 21 |
|
|
|
|
| 22 |
GOOGLE_API_KEY = os.environ['GOOGLE_API_KEY']
|
| 23 |
|
| 24 |
graph_config = {
|
|
|
|
| 29 |
}
|
| 30 |
|
| 31 |
def get_data(url: str):
    """Scrape grant listings from ``url`` with a scrapegraphai SmartScraperGraph.

    The graph is built from the module-level ``graph_config`` and a fixed
    prompt that asks for every grant or fund together with a short
    description, the funding organisations, the value as an integer, the due
    date, eligible countries and eligibility criteria.

    Args:
        url: The URL to scrape.

    Returns:
        Whatever ``SmartScraperGraph.run()`` produces. Other code in this
        file indexes it as ``result['grants']`` and reads the per-grant keys
        ``grant_name``, ``funding_organisation``, ``due_date``,
        ``eligible_countries`` and ``eligibility_conditions``.
        NOTE(review): the prompt does not name these keys explicitly, so the
        schema is presumably model-dependent -- confirm before relying on it.
    """
    smart_scraper_graph = SmartScraperGraph(
        prompt="List me all grants or funds,short summary of grant description,the organisations funding them, The value of the grant as an integer, the due date, eligible countries and eligibility criteria for applicants.",
        source=url,
        config=graph_config,
    )
    return smart_scraper_graph.run()
|
|
|
|
| 52 |
|
| 53 |
def convert_to_csv(data: dict) -> bytes:
    """Serialise the scraped grants to UTF-8 encoded CSV bytes.

    Args:
        data: Mapping with a ``'grants'`` entry that ``pandas.DataFrame``
            can consume (e.g. a list of per-grant dicts).

    Returns:
        The CSV text (header row included, index column omitted) encoded as
        UTF-8 bytes.

    Raises:
        KeyError: If ``data`` has no ``'grants'`` entry.
    """
    grants_frame = pd.DataFrame(data['grants'])
    csv_text = grants_frame.to_csv(index=False)
    return csv_text.encode('utf-8')
|
| 56 |
|
| 57 |
def convert_to_excel(data: dict) -> bytes:
    """Serialise the scraped grants to an in-memory Excel workbook.

    Args:
        data: Mapping with a ``'grants'`` entry that ``pandas.DataFrame``
            can consume (e.g. a list of per-grant dicts).

    Returns:
        The bytes of a workbook holding a single sheet named ``'Grants'``,
        written without the index column.

    Raises:
        KeyError: If ``data`` has no ``'grants'`` entry.
    """
    grants_frame = pd.DataFrame(data['grants'])
    workbook_bytes = io.BytesIO()
    # The writer must be closed (end of the ``with`` block) before the
    # buffer is read, otherwise the workbook has not been flushed into it.
    with pd.ExcelWriter(workbook_bytes, engine='xlsxwriter') as writer:
        grants_frame.to_excel(writer, sheet_name='Grants', index=False)
    return workbook_bytes.getvalue()
|
| 63 |
+
|
| 64 |
+
def create_knowledge_base(data: dict):
    """Build a conversational retrieval chain over the scraped grants.

    Each grant is rendered to a labelled text document, the documents are
    passed through a ``CharacterTextSplitter``, embedded with
    ``GoogleGenerativeAIEmbeddings`` and stored in a FAISS vector store. The
    store's retriever is then wired to a ``ChatGoogleGenerativeAI`` model and
    a ``ConversationBufferMemory``.

    Args:
        data: The scraped data; ``data['grants']`` must be an iterable of
            mappings with the keys ``grant_name``, ``funding_organisation``,
            ``due_date``, ``eligible_countries`` and
            ``eligibility_conditions``.

    Returns:
        A ConversationalRetrievalChain object for querying the knowledge base.

    Raises:
        KeyError: If ``'grants'`` or any of the per-grant keys listed above
            is missing (the fields are read by direct indexing).
    """
    # One labelled text document per grant.
    documents = [
        f"Grant Name: {grant['grant_name']}\n"
        f"Funding Organisation: {grant['funding_organisation']}\n"
        f"Due Date: {grant['due_date']}\n"
        f"Eligible Countries: {grant['eligible_countries']}\n"
        f"Eligibility Conditions: {grant['eligibility_conditions']}"
        for grant in data['grants']
    ]

    # Split the documents into chunks.
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.create_documents(documents)

    # Embed the chunks and index them in FAISS.
    embeddings = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001",
        google_api_key=GOOGLE_API_KEY,
    )
    vectorstore = FAISS.from_documents(texts, embeddings)

    # Chat model plus conversation memory; the memory key matches the
    # "chat_history" input the chain is given here.
    llm = ChatGoogleGenerativeAI(
        model="gemini-pro",
        google_api_key=GOOGLE_API_KEY,
        temperature=0,
    )
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

    return ConversationalRetrievalChain.from_llm(
        llm,
        vectorstore.as_retriever(),
        memory=memory,
    )
|
| 94 |
|
| 95 |
def main():
|
| 96 |
st.sidebar.title("Quantilytix Grant Scraper")
|
|
|
|
| 103 |
if "chat_history" not in st.session_state:
|
| 104 |
st.session_state.chat_history = []
|
| 105 |
|
| 106 |
+
if "qa_chain" not in st.session_state:
|
| 107 |
+
st.session_state.qa_chain = None
|
| 108 |
+
|
| 109 |
if st.sidebar.button("Get grants"):
|
| 110 |
if url:
|
| 111 |
try:
|
|
|
|
| 137 |
st.dataframe(result['grants'])
|
| 138 |
|
| 139 |
if st.sidebar.button("Load as Knowledge Base"):
|
| 140 |
+
st.session_state.qa_chain = create_knowledge_base(result)
|
| 141 |
st.session_state.chat_interface_active = True
|
| 142 |
|
| 143 |
if "chat_interface_active" in st.session_state and st.session_state.chat_interface_active:
|
|
|
|
| 145 |
|
| 146 |
query = st.text_input("Ask a question about the grants:", key="chat_input")
|
| 147 |
if query:
|
| 148 |
+
if st.session_state.qa_chain:
|
| 149 |
+
response = st.session_state.qa_chain({"question": query})
|
| 150 |
+
st.session_state.chat_history.append({"query": query, "response": response['answer']})
|
| 151 |
+
else:
|
| 152 |
+
st.error("Knowledge base not loaded. Please load the knowledge base first.")
|
| 153 |
|
| 154 |
# Display chat history
|
| 155 |
for chat in st.session_state.chat_history:
|