rairo commited on
Commit
32eccb0
·
verified ·
1 Parent(s): 6032808

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +116 -81
app.py CHANGED
@@ -13,6 +13,7 @@ from langchain.text_splitter import CharacterTextSplitter
13
  from langchain.chains import ConversationalRetrievalChain
14
  from langchain.memory import ConversationBufferMemory
15
  import urllib.parse
 
16
 
17
  # Ensure Playwright installs required browsers and dependencies
18
  subprocess.run(["playwright", "install"])
@@ -27,7 +28,6 @@ graph_config = {
27
  },
28
  }
29
 
30
-
31
  def get_data(url):
32
  smart_scraper_graph = SmartScraperGraph(
33
  prompt=(
@@ -40,15 +40,13 @@ def get_data(url):
40
  )
41
  return smart_scraper_graph.run()
42
 
43
-
44
  def process_multiple_urls(urls):
45
  """
46
- Process multiple URLs with progress tracking
47
  """
48
  all_data = {"grants": []}
49
  progress_bar = st.progress(0)
50
  status_container = st.empty()
51
-
52
  total_urls = len(urls)
53
  for index, url in enumerate(urls):
54
  try:
@@ -56,11 +54,8 @@ def process_multiple_urls(urls):
56
  if not url:
57
  continue
58
 
59
- # Update progress
60
  progress = (index + 1) / total_urls
61
  progress_bar.progress(progress)
62
-
63
- # Show current status
64
  status_container.markdown(
65
  f"""
66
  **Processing URL {index+1} of {total_urls}**
@@ -69,25 +64,20 @@ def process_multiple_urls(urls):
69
  ⏳ Remaining: {total_urls - index - 1}
70
  """
71
  )
72
-
73
- # Scrape data
74
  result = get_data(url)
75
  if result and "grants" in result:
76
  all_data["grants"].extend(result["grants"])
77
  except Exception as e:
78
  st.error(f"Error processing {url}: {str(e)}")
79
  continue
80
-
81
  progress_bar.empty()
82
  status_container.empty()
83
  return all_data
84
 
85
-
86
  def convert_to_csv(data):
87
  df = pd.DataFrame(data["grants"])
88
  return df.to_csv(index=False).encode("utf-8")
89
 
90
-
91
  def convert_to_excel(data):
92
  df = pd.DataFrame(data["grants"])
93
  buffer = io.BytesIO()
@@ -95,70 +85,76 @@ def convert_to_excel(data):
95
  df.to_excel(writer, sheet_name="Grants", index=False)
96
  return buffer.getvalue()
97
 
98
-
99
  def create_knowledge_base(data):
100
  documents = []
101
  for grant in data["grants"]:
102
  doc_parts = [f"{key.replace('_', ' ').title()}: {value}" for key, value in grant.items()]
103
  documents.append("\n".join(doc_parts))
104
-
105
  text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
106
  texts = text_splitter.create_documents(documents)
107
-
108
  embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
109
  vectorstore = FAISS.from_documents(texts, embeddings)
110
-
111
  llm = ChatGoogleGenerativeAI(
112
  model="gemini-2.0-flash-thinking-exp", google_api_key=GOOGLE_API_KEY, temperature=0
113
  )
114
  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
115
  return ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), memory=memory)
116
 
117
-
118
  def get_shareable_link(file_data, file_name, file_type):
119
  b64 = base64.b64encode(file_data).decode()
120
  return f"data:{file_type};base64,{b64}"
121
 
122
-
123
- def main():
124
- st.sidebar.title("Quantilytix Grant Scraper")
125
- st.sidebar.image("logoqb.jpeg", use_container_width=True)
126
-
127
- # Initialize session state for scraped data and chat history if not already present
128
- if "scraped_data" not in st.session_state:
129
- st.session_state.scraped_data = None
130
- if "chat_history" not in st.session_state:
131
- st.session_state.chat_history = []
132
- if "chat_interface_active" not in st.session_state:
133
- st.session_state.chat_interface_active = False
134
-
135
- # Multi-URL input
136
- url_input = st.sidebar.text_area(
 
 
 
 
 
 
 
 
 
 
 
137
  "Enter URLs (one per line)",
138
  height=150,
139
  help="Enter multiple URLs separated by new lines",
140
  )
141
-
142
- if st.sidebar.button("Get grants"):
143
  if url_input:
144
  urls = [url.strip() for url in url_input.split("\n") if url.strip()]
145
  if urls:
146
- try:
147
- with st.spinner("Starting scraping process..."):
148
- result = process_multiple_urls(urls)
149
- st.session_state.scraped_data = result
150
- st.success(f"Scraped {len(result['grants'])} grants from {len(urls)} URLs!")
151
- except Exception as e:
152
- st.error(f"Error in scraping process: {e}")
153
  else:
154
  st.warning("Please enter valid URLs.")
155
  else:
156
  st.warning("Please enter at least one URL.")
157
 
158
- if st.session_state.scraped_data:
159
- selected_format = st.sidebar.selectbox("Select Download Format", ("CSV", "Excel"))
 
160
  result = st.session_state.scraped_data
161
-
 
 
 
162
  if selected_format == "CSV":
163
  file_data = convert_to_csv(result)
164
  file_name = "grants.csv"
@@ -167,53 +163,92 @@ def main():
167
  file_data = convert_to_excel(result)
168
  file_name = "grants.xlsx"
169
  file_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
170
-
171
  b64 = base64.b64encode(file_data).decode()
172
  download_link = f"<a href='data:{file_type};base64,{b64}' download='{file_name}'>Download {selected_format}</a>"
173
- st.sidebar.markdown(download_link, unsafe_allow_html=True)
174
-
175
  shareable_link = get_shareable_link(file_data, file_name, file_type)
176
- st.sidebar.markdown("---")
177
- st.sidebar.markdown("**Share Options:**")
178
-
179
  whatsapp_url = f"https://api.whatsapp.com/send?text={urllib.parse.quote(f'Check out this file: {shareable_link}')}"
180
- st.sidebar.markdown(f"📱 [Share via WhatsApp]({whatsapp_url})")
181
-
182
  email_subject = urllib.parse.quote("Check out this grants file")
183
  email_body = urllib.parse.quote(f"Download the file here: {shareable_link}")
184
  email_url = f"mailto:?subject={email_subject}&body={email_body}"
185
- st.sidebar.markdown(f"📧 [Share via Email]({email_url})")
186
-
187
- with st.expander(f"Preview of data ({len(result['grants'])} grants)"):
188
- st.dataframe(result["grants"])
189
-
190
- if st.sidebar.button("Load data as Knowledge Base"):
191
- st.session_state.qa_chain = create_knowledge_base(result)
 
 
 
192
  st.session_state.chat_interface_active = True
193
- st.session_state.chat_history = [] # Initialize chat_history when KB is loaded
194
-
195
- if st.session_state.get("chat_interface_active"):
196
- st.header("Chat Interface Loaded. Start asking questions about the grants!")
197
- st.image("logoqb.jpeg", width=150)
198
- query = st.text_input("Ask a question about the grants:", key="chat_input")
199
-
200
- if query:
201
- if st.session_state.qa_chain:
202
  response = st.session_state.qa_chain({"question": query})
203
  st.session_state.chat_history.append({"query": query, "response": response["answer"]})
204
- else:
205
- st.error("Knowledge base not loaded. Please load the knowledge base first.")
206
-
207
- for chat in st.session_state.get("chat_history", []):
208
- st.markdown(
209
- f"<p style='color: #8C92AC;'><strong>You:</strong> {chat['query']}</p>",
210
- unsafe_allow_html=True,
211
- )
212
- st.markdown(
213
- f"<p style='color: #6699CC;'><strong>Grants Bot:</strong> {chat['response']}</p>",
214
- unsafe_allow_html=True,
215
- )
 
 
 
 
 
 
216
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
 
218
  if __name__ == "__main__":
219
- main()
 
 
 
 
 
 
 
13
  from langchain.chains import ConversationalRetrievalChain
14
  from langchain.memory import ConversationBufferMemory
15
  import urllib.parse
16
+ import plotly.express as px
17
 
18
  # Ensure Playwright installs required browsers and dependencies
19
  subprocess.run(["playwright", "install"])
 
28
  },
29
  }
30
 
 
31
  def get_data(url):
32
  smart_scraper_graph = SmartScraperGraph(
33
  prompt=(
 
40
  )
41
  return smart_scraper_graph.run()
42
 
 
43
  def process_multiple_urls(urls):
44
  """
45
+ Process multiple URLs with progress tracking.
46
  """
47
  all_data = {"grants": []}
48
  progress_bar = st.progress(0)
49
  status_container = st.empty()
 
50
  total_urls = len(urls)
51
  for index, url in enumerate(urls):
52
  try:
 
54
  if not url:
55
  continue
56
 
 
57
  progress = (index + 1) / total_urls
58
  progress_bar.progress(progress)
 
 
59
  status_container.markdown(
60
  f"""
61
  **Processing URL {index+1} of {total_urls}**
 
64
  ⏳ Remaining: {total_urls - index - 1}
65
  """
66
  )
 
 
67
  result = get_data(url)
68
  if result and "grants" in result:
69
  all_data["grants"].extend(result["grants"])
70
  except Exception as e:
71
  st.error(f"Error processing {url}: {str(e)}")
72
  continue
 
73
  progress_bar.empty()
74
  status_container.empty()
75
  return all_data
76
 
 
77
def convert_to_csv(data):
    """Serialize the scraped grants into UTF-8 encoded CSV bytes.

    Args:
        data: Mapping with a "grants" key holding a list of grant dicts.

    Returns:
        bytes: CSV content (header row + data rows, no index column).
    """
    frame = pd.DataFrame(data["grants"])
    csv_text = frame.to_csv(index=False)
    return csv_text.encode("utf-8")
80
 
 
81
  def convert_to_excel(data):
82
  df = pd.DataFrame(data["grants"])
83
  buffer = io.BytesIO()
 
85
  df.to_excel(writer, sheet_name="Grants", index=False)
86
  return buffer.getvalue()
87
 
 
88
def create_knowledge_base(data):
    """Build a conversational retrieval chain over the scraped grants.

    Each grant dict is flattened into "Key: value" lines, chunked, embedded
    with Google Generative AI embeddings, and indexed in a FAISS vector store
    that backs a ConversationalRetrievalChain with buffer memory.

    Args:
        data: Mapping with a "grants" key holding a list of grant dicts.

    Returns:
        A ConversationalRetrievalChain ready to answer questions.
    """
    documents = [
        "\n".join(
            f"{key.replace('_', ' ').title()}: {value}"
            for key, value in grant.items()
        )
        for grant in data["grants"]
    ]
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.create_documents(documents)
    embeddings = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001", google_api_key=GOOGLE_API_KEY
    )
    index = FAISS.from_documents(chunks, embeddings)
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash-thinking-exp",
        google_api_key=GOOGLE_API_KEY,
        temperature=0,
    )
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    return ConversationalRetrievalChain.from_llm(llm, index.as_retriever(), memory=memory)
102
 
 
103
def get_shareable_link(file_data, file_name, file_type):
    """Return a base64 data URI embedding *file_data*.

    Note: *file_name* is accepted for interface compatibility but is not
    encoded into the URI itself.

    Args:
        file_data: Raw file bytes.
        file_name: Suggested download name (unused here).
        file_type: MIME type placed in the URI header.

    Returns:
        str: A ``data:<mime>;base64,<payload>`` URI.
    """
    payload = base64.b64encode(file_data).decode()
    return f"data:{file_type};base64,{payload}"
106
 
107
def display_dashboard(data):
    """Render an interactive dashboard for the scraped grants.

    Offers an optional sector filter, then (when the relevant columns exist)
    plots a histogram of grant values and a pie chart of grants per
    organisation, ending with a table of the filtered rows.
    """
    frame = pd.DataFrame(data["grants"])
    if frame.empty:
        st.info("No data available for dashboard.")
        return
    st.subheader("Grants Dashboard")
    # Sector filter is shown only when the scrape produced a "sector" column.
    if "sector" in frame.columns:
        sector_options = ["All"] + frame["sector"].dropna().unique().tolist()
        chosen = st.selectbox("Select Sector", options=sector_options)
        if chosen != "All":
            frame = frame[frame["sector"] == chosen]
    # Histogram of grant values; coercion turns unparseable values into NaN.
    if "value" in frame.columns:
        frame["value"] = pd.to_numeric(frame["value"], errors="coerce")
        st.plotly_chart(px.histogram(frame, x="value", nbins=20, title="Distribution of Grant Values"))
    # Share of grants per organisation.
    if "organisation" in frame.columns:
        st.plotly_chart(px.pie(frame, names="organisation", title="Grants by Organisation"))
    st.dataframe(frame)
129
+
130
def display_scrape_tab():
    """Render the scraping tab: collect URLs and trigger the scrape.

    Stores the combined result in ``st.session_state.scraped_data`` so the
    other tabs (download, dashboard, chat) can reuse it.
    """
    st.header("Scrape Grants")
    url_input = st.text_area(
        "Enter URLs (one per line)",
        height=150,
        help="Enter multiple URLs separated by new lines",
    )
    if st.button("Start Scraping"):
        if url_input:
            urls = [url.strip() for url in url_input.split("\n") if url.strip()]
            if urls:
                # Guard the whole scrape so an unexpected failure surfaces as
                # an in-app error message instead of crashing the script run.
                try:
                    with st.spinner("Scraping grants..."):
                        result = process_multiple_urls(urls)
                        st.session_state.scraped_data = result
                        st.success(f"Scraped {len(result['grants'])} grants from {len(urls)} URL(s)!")
                except Exception as e:
                    st.error(f"Error in scraping process: {e}")
            else:
                st.warning("Please enter valid URLs.")
        else:
            st.warning("Please enter at least one URL.")
149
 
150
+ def display_download_tab():
151
+ st.header("Download & Explore Data")
152
+ if st.session_state.get("scraped_data"):
153
  result = st.session_state.scraped_data
154
+ df = pd.DataFrame(result["grants"])
155
+ st.subheader(f"Data Preview ({len(df)} grants)")
156
+ st.dataframe(df)
157
+ selected_format = st.selectbox("Select Download Format", ("CSV", "Excel"))
158
  if selected_format == "CSV":
159
  file_data = convert_to_csv(result)
160
  file_name = "grants.csv"
 
163
  file_data = convert_to_excel(result)
164
  file_name = "grants.xlsx"
165
  file_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
 
166
  b64 = base64.b64encode(file_data).decode()
167
  download_link = f"<a href='data:{file_type};base64,{b64}' download='{file_name}'>Download {selected_format}</a>"
168
+ st.markdown(download_link, unsafe_allow_html=True)
 
169
  shareable_link = get_shareable_link(file_data, file_name, file_type)
170
+ st.markdown("---")
171
+ st.markdown("**Share Options:**")
 
172
  whatsapp_url = f"https://api.whatsapp.com/send?text={urllib.parse.quote(f'Check out this file: {shareable_link}')}"
173
+ st.markdown(f"📱 [Share via WhatsApp]({whatsapp_url})")
 
174
  email_subject = urllib.parse.quote("Check out this grants file")
175
  email_body = urllib.parse.quote(f"Download the file here: {shareable_link}")
176
  email_url = f"mailto:?subject={email_subject}&body={email_body}"
177
+ st.markdown(f"📧 [Share via Email]({email_url})")
178
+ else:
179
+ st.info("No scraped data available. Please scrape grants first.")
180
+
181
def display_chat_tab():
    """Render the knowledge-base chat tab.

    Lets the user build a QA chain from the scraped data, then ask questions;
    the running conversation is kept in ``st.session_state.chat_history``.
    """
    st.header("Knowledge Base Chat")
    st.info("Ask questions about the grants data.")
    if st.session_state.get("scraped_data"):
        if st.button("Load Data as Knowledge Base"):
            st.session_state.qa_chain = create_knowledge_base(st.session_state.scraped_data)
            st.session_state.chat_interface_active = True
            st.session_state.chat_history = []
            st.success("Knowledge base loaded!")
        if st.session_state.get("chat_interface_active"):
            query = st.text_input("Enter your query:")
            if query:
                # The active flag can outlive the chain (e.g. a fresh session
                # restored the flag but qa_chain was never rebuilt), so check
                # the chain exists before calling it instead of raising.
                qa_chain = st.session_state.get("qa_chain")
                if qa_chain:
                    response = qa_chain({"question": query})
                    st.session_state.chat_history.append({"query": query, "response": response["answer"]})
                else:
                    st.error("Knowledge base not loaded. Please load the knowledge base first.")
            for chat in st.session_state.get("chat_history", []):
                st.markdown(f"**You:** {chat['query']}")
                st.markdown(f"**Grants Bot:** {chat['response']}")
        else:
            st.info("Load the knowledge base to start chatting.")
    else:
        st.info("No scraped data available. Please scrape grants first.")
202
+
203
def display_alerts_tab():
    """Render the (placeholder) automated-alerts configuration tab."""
    st.header("Automated Alerts Setup")
    st.write("Configure your personalized alerts for new grant opportunities.")
    # Inputs are collected but not yet persisted anywhere — see placeholder note.
    prompts = (
        ("Keyword for Alerts", "E.g., 'AI', 'sustainable research'"),
        ("Sector", "E.g., 'health', 'technology'"),
        ("Your Email", "Enter your email to receive alerts"),
    )
    keyword, sector, email = (st.text_input(label, help=hint) for label, hint in prompts)
    if st.button("Save Alert Preferences"):
        # Placeholder for saving alert preferences—integrate with an email service in a full implementation.
        st.success("Alert preferences saved! You will be notified when matching grants are found.")
    st.info("Note: Automated alert functionality is under development and will be integrated soon.")
213
 
214
def main():
    """Entry point: configure the page and route sidebar navigation to a tab."""
    st.set_page_config(page_title="Quantilytix Grants Platform", layout="wide", initial_sidebar_state="expanded")
    # Custom CSS styling for a modern look
    st.markdown("""
        <style>
        .main {
            background-color: #f5f5f5;
        }
        .sidebar .sidebar-content {
            background-image: linear-gradient(#2e7bcf, #2e7bcf);
            color: white;
        }
        </style>
    """, unsafe_allow_html=True)

    st.sidebar.title("Quantilytix Grants Platform")
    # use_container_width replaces the deprecated use_column_width parameter
    # (the previous revision of this file already used use_container_width).
    st.sidebar.image("logoqb.jpeg", use_container_width=True)
    app_mode = st.sidebar.radio("Navigation", ["Scrape Grants", "Download & Explore", "Dashboard", "Knowledge Base Chat", "Automated Alerts"])

    if app_mode == "Scrape Grants":
        display_scrape_tab()
    elif app_mode == "Download & Explore":
        display_download_tab()
    elif app_mode == "Dashboard":
        if st.session_state.get("scraped_data"):
            display_dashboard(st.session_state.scraped_data)
        else:
            st.info("No data available. Please scrape grants first.")
    elif app_mode == "Knowledge Base Chat":
        display_chat_tab()
    elif app_mode == "Automated Alerts":
        display_alerts_tab()
246
 
247
if __name__ == "__main__":
    # Seed session-state keys once per session so the tab renderers can
    # assume they exist without re-checking.
    _defaults = {
        "scraped_data": None,
        "chat_history": [],
        "chat_interface_active": False,
    }
    for _key, _value in _defaults.items():
        if _key not in st.session_state:
            st.session_state[_key] = _value
    main()