rairo committed on
Commit
45e9c8a
·
verified ·
1 Parent(s): 605e112

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -451
app.py CHANGED
@@ -1,463 +1,59 @@
1
  import streamlit as st
2
- import pandas as pd
3
- import base64
4
- import json
5
- from scrapegraphai.graphs import SearchGraph
6
- import nest_asyncio
7
- import os
8
- import subprocess
9
- import io
10
- import time
11
- import urllib.parse
12
  import asyncio
13
- from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
14
- from langchain.vectorstores import FAISS
15
- from langchain.text_splitter import CharacterTextSplitter
16
- from langchain.chains import ConversationalRetrievalChain
17
- from langchain.memory import ConversationBufferMemory
18
- from google import genai
19
- from google.genai import types
20
- from langchain_community.document_loaders import PlaywrightURLLoader
21
- import requests
22
- # Import Supadata and initialize the client
23
- from supadata import Supadata, SupadataError
24
- # Import Crawl4AI
25
  from crawl4ai import AsyncWebCrawler
26
 
27
# --- Supadata client -------------------------------------------------------
SUPADATA_API_KEY = os.getenv("SUPADATA")
supadata = Supadata(api_key=SUPADATA_API_KEY)

# Playwright needs its browser binaries downloaded before first use; install
# them at startup, then allow nested event loops (Streamlit + asyncio mix).
subprocess.run(["playwright", "install"])
nest_asyncio.apply()

# Fail fast (KeyError) if the Gemini key is missing from the environment.
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]

# Shared ScrapeGraphAI configuration used by every SearchGraph run.
graph_config = {
    "llm": {
        "api_key": GOOGLE_API_KEY,
        "model": "google_genai/gemini-2.0-flash-lite",
    },
    "max_results": 8,
    "verbose": True,
    "headless": True,
}
45
 
46
-
47
def get_data(search_term):
    """
    Run the SearchGraph for a given search term.

    If a rate-limit error (202) occurs, wait 10 seconds and retry once.
    If no results are returned or an error persists, notify the user.

    Args:
        search_term: Free-text topic to search grants for.

    Returns:
        The SearchGraph result dict (expected to hold a "grants" list),
        or {} when nothing was found or an error occurred.
    """
    full_prompt = (
        f"search for {search_term} grants\n\n"
        "List me all grants or funds with:\n"
        "- Grant name/title\n"
        "- Short summary \n"
        "- Funding organization\n"
        "- Grant value (numeric only)\n"
        "- Application deadline\n"
        "- Eligible countries\n"
        "- Sector/field\n"
        "- Eligibility criteria\n"
        "Return in JSON format."
    )

    def _run_once():
        # One SearchGraph invocation; {} when no grants came back.
        # (The original duplicated this whole body in the retry branch.)
        search_graph = SearchGraph(
            prompt=full_prompt,
            config=graph_config,
        )
        result = search_graph.run()
        if not result or not result.get("grants"):
            st.error(f"No results returned for {search_term}. Please try again with a different search term.")
            return {}
        return result

    try:
        return _run_once()
    except Exception as e:
        if "202" in str(e):
            # Rate-limited: back off once, then retry a single time.
            st.warning("Rate limit reached (202). Waiting 10 seconds before retrying...")
            time.sleep(10)
            try:
                return _run_once()
            except Exception as e2:
                st.error(f"Retry failed: {e2}. Please try again later.")
                return {}
        st.error(f"An error occurred for search term: {search_term}, error: {e}. Please try again.")
        return {}
97
-
98
-
99
-
100
# NOTE: duplicate `SUPADATA_API_KEY = os.getenv("SUPADATA")` removed — it is
# already assigned (to the same value) at the top of the module.

def get_data_from_url(url, scraping_tool="supadata"):
    """
    Scrape the provided URL using the selected scraping tool, then extract
    grant data from the page content via Gemini.

    Args:
        url: The URL to scrape.
        scraping_tool: Either "supadata", "crawl4ai", or "playwright".
            If "crawl4ai" or "playwright" fails, falls back to "supadata".

    Returns:
        Dictionary of the form {"grants": [...]}, or {} on any failure.
    """
    page_content = None  # Filled in by the first scraper that succeeds

    if scraping_tool == "crawl4ai":
        try:
            # Crawl4AI is async; run it on a private event loop.
            async def run_crawler():
                async with AsyncWebCrawler() as crawler:
                    result = await crawler.arun(url=url)
                    return result.markdown

            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                page_content = loop.run_until_complete(run_crawler())
            finally:
                # BUGFIX: close the loop even when the crawl raises,
                # instead of leaking it on the exception path.
                loop.close()

            st.success("Successfully scraped using Crawl4AI")
        except Exception as e:
            st.error(f"Error using Crawl4AI: {e}")
            st.warning("Falling back to Supadata scraper...")
            scraping_tool = "supadata"

    if scraping_tool == "playwright":
        try:
            loader = PlaywrightURLLoader(urls=[url], remove_selectors=["header", "footer"])
            # BUGFIX: the original called loader.aload() — the *async* variant —
            # without awaiting it, so `data` was a coroutine, not a document
            # list, and indexing it always raised. Use the sync load().
            data = loader.load()
            page_content = data[0].page_content if data else ""
            st.success("Successfully scraped using Playwright")
        except Exception as e:
            st.error(f"Error using Playwright: {e}")
            st.warning("Falling back to Supadata scraper...")
            scraping_tool = "supadata"

    if scraping_tool == "supadata":
        # Step 1: Supadata's built-in scraper.
        try:
            web_content = supadata.web.scrape(url)
            page_content = web_content.content
            st.success("Successfully scraped using Supadata built-in scraper")
        except TypeError as te:
            if "unexpected keyword argument 'type'" in str(te):
                st.warning("Falling back to Supadata API due to unexpected keyword 'type' error.")
            else:
                st.error(f"Unexpected error in Supadata scrape: {te}")
        except Exception as e:
            # BUGFIX: the original only caught TypeError, so any other
            # Supadata failure crashed instead of reaching the fallbacks.
            st.error(f"Unexpected error in Supadata scrape: {e}")

        # Step 2: raw Supadata HTTP API.
        if not page_content:
            try:
                api_url = "https://api.supadata.ai/v1/web/scrape"
                headers = {"X-API-Key": SUPADATA_API_KEY}
                response = requests.get(api_url, headers=headers, params={"url": url})

                if response.status_code == 200:
                    page_content = response.json().get("content", "")
                    st.success("Successfully scraped using Supadata API")
                else:
                    st.error(f"Supadata API failed with status {response.status_code}")
            except Exception as e:
                st.error(f"Error calling Supadata API: {e}")

        # Step 3: last resort — plain GET of the page itself.
        if not page_content:
            try:
                r = requests.get(url, timeout=10)
                if r.status_code == 200:
                    page_content = r.text
                    st.success("Successfully retrieved content with direct request")
                else:
                    st.error(f"Manual scraping failed with status code {r.status_code}")
                    return {}
            except Exception as e:
                st.error(f"Manual scraping error: {e}")
                return {}

    if not page_content:
        st.error("Failed to retrieve content from the URL with all available methods")
        return {}

    # Hand the scraped content to Gemini for structured extraction.
    full_prompt = (
        "Extract the following grant data from the provided web content. "
        "- Grant name/title\n"
        "- Short summary\n"
        "- Funding organization\n"
        "- Grant value (numeric only)\n"
        "- Application deadline\n"
        "- Eligible countries\n"
        "- Sector/field\n"
        "- Eligibility criteria\n"
        "Return in JSON format.\n\n"
        f"Web content: {page_content}"
    )

    client = genai.Client(api_key=GOOGLE_API_KEY)
    new_answer = client.models.generate_content(
        model="models/gemini-2.0-flash-lite",
        contents=f"{full_prompt}, return the json string and nothing else"
    )

    response = new_answer.text

    # Extract the JSON payload from the model reply.
    # BUGFIX: the original only searched for a `[` ... `]` array, yet later
    # checked `isinstance(result, list)` — a top-level JSON object reply
    # always failed to parse. Fall back to `{` ... `}` when no array exists.
    try:
        start_index = response.find('[')
        end_index = response.rfind(']') + 1
        if start_index == -1 or end_index == 0:
            start_index = response.find('{')
            end_index = response.rfind('}') + 1
        json_string = response[start_index:end_index]
        result = json.loads(json_string)
    except Exception:
        st.error(f"Error parsing JSON from Gemini model response. Response: {response}")
        return {}

    # Normalize a bare list into the {"grants": [...]} envelope callers expect.
    if isinstance(result, list):
        result = {"grants": result}

    if not result.get("grants"):
        st.error("No grant opportunities found in the scraped URL.")
        return {}

    st.success(f"First grant opportunity: {result['grants'][0]}")
    return result
238
-
239
-
240
-
241
def process_multiple_search_terms(search_terms):
    """
    Run get_data for each term, tracking progress in the UI.

    Returns a dict with a 'grants' key holding the combined results.
    """
    combined = {"grants": []}
    bar = st.progress(0)
    status = st.empty()
    total_terms = len(search_terms)

    for index, raw_term in enumerate(search_terms):
        term = raw_term.strip()
        if not term:
            continue  # skip blank lines from the text area

        bar.progress((index + 1) / total_terms)
        status.markdown(
            f"""
            **Processing Grant Opportunities** 🚀
            Searching term {index+1} of {total_terms}: `{term}`
            <br>
            <p style='font-size: 0.9em; color: #6699CC;'>Completed: {index}/{total_terms} | Remaining: {total_terms - index - 1}</p>
            """,
            unsafe_allow_html=True,
        )

        found = get_data(term)
        if found and found.get("grants"):
            combined["grants"].extend(found["grants"])

    bar.empty()
    status.empty()
    if not combined["grants"]:
        st.error("No grant opportunities were found. Please try again with different search terms.")
    return combined
276
-
277
def convert_to_csv(data):
    """Serialize the grant records to UTF-8 CSV bytes (no index column)."""
    frame = pd.DataFrame(data["grants"])
    return frame.to_csv(index=False).encode("utf-8")
280
-
281
def convert_to_excel(data):
    """Serialize the grant records into an in-memory XLSX workbook ("Grants" sheet)."""
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine="xlsxwriter") as workbook:
        pd.DataFrame(data["grants"]).to_excel(workbook, sheet_name="Grants", index=False)
    return output.getvalue()
287
-
288
def create_knowledge_base(data):
    """Snapshot the grant data as pretty-printed JSON in session state for the chat bot."""
    serialized = json.dumps(data, indent=2)
    st.session_state.knowledge_base_json = serialized
291
-
292
def chat_with_knowledge_base(query):
    """
    Answer *query* against the stored grant JSON using Gemini.

    Returns the model's answer text, or a notice string when no
    knowledge base has been loaded yet.
    """
    if "knowledge_base_json" not in st.session_state:
        return "Knowledge base not initialized. Please load grant data first."

    context = st.session_state.knowledge_base_json
    prompt = f"""
    You are an AI assistant that helps users analyze grant opportunities.
    Here is the extracted grant data in JSON format:

    {context}

    User's question: {query}
    Answer the question based on the provided grant data.
    """
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash-thinking-exp", google_api_key=GOOGLE_API_KEY, temperature=0
    )
    return llm.invoke(prompt).content
311
-
312
def get_shareable_link(file_data, file_name, file_type):
    """Build a base64 data: URI for *file_data* (*file_name* unused, kept for API compat)."""
    encoded = base64.b64encode(file_data).decode()
    return f"data:{file_type};base64,{encoded}"
315
-
316
def main():
    """Streamlit entry point: sidebar inputs, scrape triggers, preview, download/share, chat."""
    st.set_page_config(page_title="Quantilytix Grant Finder", page_icon="💰", layout="wide")
    st.title("💰 Quantilytix Grant Finder")
    st.markdown("""
    <div style="text-align: justify;">
    <p>
    Welcome to <b>Quantilytix Grant Finder</b>, an AI-powered platform designed to streamline the grant discovery process, especially for academics and researchers across the globe.
    </p>
    </div>
    """, unsafe_allow_html=True)

    # --- Sidebar header ---
    st.sidebar.image("logoqb.jpeg", use_container_width=True)
    st.sidebar.header("Scrape & Configure")

    # Seed session state on first run.
    if "scraped_data" not in st.session_state:
        st.session_state.scraped_data = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    if "chat_interface_active" not in st.session_state:
        st.session_state.chat_interface_active = False

    # Choose between free-text search and direct URL scraping.
    input_type = st.sidebar.radio(
        "Select Input Type:",
        ("Search Query", "URL"),
        key="input_type_selector"
    )

    # Render the matching input widget.
    if input_type == "Search Query":
        search_input = st.sidebar.text_area(
            "Enter Search Terms (one per line). Maximum 2",
            height=150,
            help="Input search terms to discover grant opportunities. Terms can be specific or generic.",
            placeholder="e.g.,\nRenewable energy \nclimate change research\nAgriculture in Africa"
        )
    else:
        url_input = st.sidebar.text_input(
            "Enter URL to scrape for grant opportunities",
            placeholder="https://example.com/grants"
        )

    # Which scraper backend to use for URL mode.
    scraping_tool = st.sidebar.radio(
        "Select Scraping Tool:",
        ("Supadata", "Crawl4AI", "Playwright"),
        key="scraping_tool_selector"
    )

    # --- Trigger buttons ---
    if input_type == "Search Query":
        if st.sidebar.button("🔍 Get Grant Opportunities"):
            if search_input:
                search_terms = [term.strip() for term in search_input.split("\n") if term.strip()]
                if search_terms:
                    with st.spinner("Searching in progress... Please wait patiently."):
                        result = process_multiple_search_terms(search_terms)
                        st.session_state.scraped_data = result
                        if result.get("grants"):
                            st.sidebar.success(f"✅ Found {len(result['grants'])} grant opportunities from {len(search_terms)} search terms!")
                else:
                    st.sidebar.warning("⚠️ Please enter valid search terms.")
            else:
                st.sidebar.warning("⚠️ Please enter at least one search term to begin.")
    else:  # URL mode
        if st.sidebar.button("🔍 Scrape URL for Grant Opportunities"):
            if url_input:
                with st.spinner(f"Scraping URL using {scraping_tool}... Please wait patiently."):
                    result = get_data_from_url(url_input, scraping_tool.lower())
                    st.session_state.scraped_data = result
                    if result.get("grants"):
                        st.sidebar.success(f"✅ Found {len(result['grants'])} grant opportunities from the URL!")
            else:
                st.sidebar.warning("⚠️ Please enter a valid URL to scrape.")

    # --- Download & share (only once we have grants) ---
    if st.session_state.scraped_data and st.session_state.scraped_data.get('grants'):
        st.sidebar.markdown("---")
        st.sidebar.subheader("Download & Share")
        selected_format = st.sidebar.selectbox("Download As:", ("CSV", "Excel"), key="download_format_selector")
        if selected_format == "CSV":
            file_data = convert_to_csv(st.session_state.scraped_data)
            file_name = "grants_data.csv"
            file_type = "text/csv"
        else:
            file_data = convert_to_excel(st.session_state.scraped_data)
            file_name = "grants_data.xlsx"
            file_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        download_link_html = f"<a href='data:{file_type};base64,{base64.b64encode(file_data).decode()}' download='{file_name}'><button style='background-color:#4CAF50;color:white;padding:10px 15px;border:none;border-radius:4px;'>⬇️ Download {selected_format}</button></a>"
        st.sidebar.markdown(download_link_html, unsafe_allow_html=True)
        shareable_link = get_shareable_link(file_data, file_name, file_type)
        whatsapp_url = f"https://api.whatsapp.com/send?text={urllib.parse.quote(f'Check out these grant opportunities: {shareable_link}')}"
        email_subject = urllib.parse.quote("Grant Opportunities File")
        email_body = urllib.parse.quote(f"Download the grant opportunities file here: {shareable_link}")
        email_url = f"mailto:?subject={email_subject}&body={email_body}"
        st.sidebar.markdown("<div style='margin-top:10px;'>Share via:</div>", unsafe_allow_html=True)
        st.sidebar.markdown(f"📱 [WhatsApp]({whatsapp_url}) | 📧 [Email]({email_url})", unsafe_allow_html=True)

        # Load the scraped grants into the chat knowledge base.
        # NOTE(review): reconstructed as nested under the has-grants guard,
        # since it reads st.session_state.scraped_data — confirm against the
        # original layout.
        if st.sidebar.button("🧠 Load as Knowledge Base & Chat"):
            with st.spinner("Loading data into knowledge base..."):
                create_knowledge_base(st.session_state.scraped_data)
                st.session_state.chat_interface_active = True
                st.session_state.chat_history = []
            st.sidebar.success("Knowledge base loaded!")

    # --- Main area: data preview + chat ---
    st.markdown("---")
    if st.session_state.scraped_data and st.session_state.scraped_data.get('grants'):
        st.header("📊 Found Grant Data")
        with st.expander(f"📊 Preview Grant Data ({len(st.session_state.scraped_data['grants'])} grants)"):
            st.dataframe(st.session_state.scraped_data["grants"])

        # Chat UI, shown only after the knowledge base is loaded.
        if st.session_state.get("chat_interface_active"):
            st.header("💬 Chat with Grants Bot")
            query = st.text_input("Your question:", key="chat_input_main")
            if query:
                with st.spinner("Generating response..."):
                    response = chat_with_knowledge_base(query)
                    answer = response["answer"] if isinstance(response, dict) and "answer" in response else response
                    st.session_state.chat_history.append({"query": query, "response": answer})

            if st.session_state.chat_history:
                st.subheader("Chat History")
                for chat in reversed(st.session_state.chat_history):
                    st.markdown(
                        f"<div style='padding: 10px; border-radius: 5px; margin-bottom: 5px; background-color:#444444; color: white;'><strong>You:</strong> {chat['query']}</div>",
                        unsafe_allow_html=True)
                    st.markdown(
                        f"<div style='padding: 10px; border-radius: 5px; margin-bottom: 10px; background-color:#007BFF; color: white;'><strong>Grants Bot:</strong> {chat['response']}</div>",
                        unsafe_allow_html=True)
    else:
        st.info("⬅️ Enter search terms or a URL in the sidebar and click the appropriate button to start searching.")

    st.sidebar.markdown("---")
    st.sidebar.markdown(
        """
        <div style='text-align: center; font-size: 0.8em; color: grey;'>
        Powered by <a href="https://quantilytix.com" style='color: grey;'>Quantilytix</a> | &copy; 2025
        </div>
        """,
        unsafe_allow_html=True,
    )


if __name__ == "__main__":
    main()
 
 
1
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
2
  import asyncio
 
 
 
 
 
 
 
 
 
 
 
 
3
  from crawl4ai import AsyncWebCrawler
4
 
5
# Page chrome.
st.set_page_config(page_title="Web Crawler App", layout="wide")

st.title("Web Crawler App")

# Target URL to crawl.
url = st.text_input("Enter URL to crawl:", value="https://www.nbcnews.com/business")

# Crawl tuning knobs, tucked away behind an expander.
with st.expander("Advanced Options"):
    max_depth = st.slider("Max Crawl Depth", min_value=1, max_value=5, value=1)
    timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=30)
    max_pages = st.number_input("Max Pages to Crawl", min_value=1, max_value=100, value=10)
 
 
 
 
17
 
18
# Async worker that performs the actual crawl.
async def run_crawler(url, max_depth=1, timeout=30, max_pages=10):
    """
    Crawl *url* with crawl4ai and return the extracted content as markdown.

    NOTE(review): assumes the installed crawl4ai version accepts
    max_depth/timeout/max_pages as arun() keyword arguments — confirm.
    """
    async with AsyncWebCrawler() as crawler:
        crawl_result = await crawler.arun(
            url=url,
            max_depth=max_depth,
            timeout=timeout,
            max_pages=max_pages,
        )
        return crawl_result.markdown
28
+
29
# Button to start crawling.
if st.button("Start Crawling"):
    with st.spinner("Crawling in progress..."):
        try:
            # asyncio.run creates, runs, and tears down a fresh event loop.
            # This replaces the manual new_event_loop()/set_event_loop()/close()
            # dance, which left a *closed* loop installed as the thread's
            # current event loop after each crawl.
            result = asyncio.run(run_crawler(
                url=url,
                max_depth=max_depth,
                timeout=timeout,
                max_pages=max_pages
            ))
            # Display the results
            st.subheader("Crawl Results")
            st.markdown(result)
            # Option to download results
            st.download_button(
                label="Download Results",
                data=result,
                file_name="crawl_results.md",
                mime="text/markdown"
            )
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
+ # Add footer with information
58
+ st.markdown("---")
59
+ st.markdown("This app uses the crawl4ai library to extract content from web pages.")