rairo committed on
Commit
605e112
·
verified ·
1 Parent(s): 0eb897b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -45
app.py CHANGED
@@ -9,6 +9,7 @@ import subprocess
9
  import io
10
  import time
11
  import urllib.parse
 
12
  from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
13
  from langchain.vectorstores import FAISS
14
  from langchain.text_splitter import CharacterTextSplitter
@@ -20,6 +21,9 @@ from langchain_community.document_loaders import PlaywrightURLLoader
20
  import requests
21
  # Import Supadata and initialize the client
22
  from supadata import Supadata, SupadataError
 
 
 
23
  SUPADATA_API_KEY = os.getenv("SUPADATA")
24
  supadata = Supadata(api_key=SUPADATA_API_KEY)
25
 
@@ -39,6 +43,7 @@ graph_config = {
39
  "headless": True
40
  }
41
 
 
42
  def get_data(search_term):
43
  """
44
  Run the SearchGraph for a given search term.
@@ -94,56 +99,98 @@ def get_data(search_term):
94
 
95
  SUPADATA_API_KEY = os.getenv("SUPADATA")
96
 
97
- def get_data_from_url(url):
98
-
99
- loader = PlaywrightURLLoader(urls=[url], remove_selectors=["header", "footer"])
100
-
101
- data = loader.aload()
102
-
103
- test_s = data
104
  """
105
- Scrape the provided URL using Supadata. If it fails, fall back to the Supadata API,
106
- and if that fails, fall back to a direct request. Extract grant data using Gemini AI.
 
 
 
 
 
 
107
  """
108
  page_content = None # Placeholder for storing scraped page content
109
 
110
- # **Step 1: Attempt Supadata's Built-in Scraper**
111
- try:
112
- web_content = supadata.web.scrape(url)
113
- page_content = web_content.content
114
- except TypeError as te:
115
- if "unexpected keyword argument 'type'" in str(te):
116
- st.warning("Falling back to Supadata API due to unexpected keyword 'type' error.")
117
- else:
118
- st.error(f"Unexpected error in Supadata scrape: {te}, {test_s}")
119
-
120
- # **Step 2: If Supadata's Built-in Scraper Fails, Use Supadata API**
121
- if not page_content:
122
  try:
123
- api_url = "https://api.supadata.ai/v1/web/scrape"
124
- headers = {"X-API-Key": SUPADATA_API_KEY}
125
- response = requests.get(api_url, headers=headers, params={"url": url})
 
 
126
 
127
- if response.status_code == 200:
128
- page_content = response.json().get("content", "")
129
- else:
130
- st.error(f"Supadata API failed with status {response.status_code} data: {test_s} ")
 
 
 
131
  except Exception as e:
132
- st.error(f"Error calling Supadata API: {e}, data: {test_s}")
133
-
134
- # **Step 3: If Supadata API Fails, Use Direct Web Request**
135
- if not page_content:
 
 
136
  try:
137
- r = requests.get(url, timeout=10)
138
- if r.status_code == 200:
139
- page_content = r.text
140
- st.success(f"{test_s}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  else:
142
- st.error(f"Manual scraping failed with status code {r.status_code}, data:{test_s}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  return {}
144
- except Exception as e:
145
- st.error(f"Manual scraping error: {e}, data:{test_s}")
146
- return {}
 
 
147
 
148
  # **Pass Content to Gemini AI**
149
  full_prompt = (
@@ -191,8 +238,6 @@ def get_data_from_url(url):
191
 
192
 
193
 
194
-
195
-
196
  def process_multiple_search_terms(search_terms):
197
  """
198
  Process multiple search terms with progress tracking.
@@ -310,6 +355,13 @@ def main():
310
  "Enter URL to scrape for grant opportunities",
311
  placeholder="https://example.com/grants"
312
  )
 
 
 
 
 
 
 
313
 
314
  # Execute based on input type selection
315
  if input_type == "Search Query":
@@ -329,8 +381,8 @@ def main():
329
  else: # URL input
330
  if st.sidebar.button("🔍 Scrape URL for Grant Opportunities"):
331
  if url_input:
332
- with st.spinner("Scraping URL... Please wait patiently."):
333
- result = get_data_from_url(url_input)
334
  st.session_state.scraped_data = result
335
  if result.get("grants"):
336
  st.sidebar.success(f"✅ Found {len(result['grants'])} grant opportunities from the URL!")
@@ -408,4 +460,4 @@ def main():
408
  )
409
 
410
  if __name__ == "__main__":
411
- main()
 
9
  import io
10
  import time
11
  import urllib.parse
12
+ import asyncio
13
  from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
14
  from langchain.vectorstores import FAISS
15
  from langchain.text_splitter import CharacterTextSplitter
 
21
  import requests
22
  # Import Supadata and initialize the client
23
  from supadata import Supadata, SupadataError
24
+ # Import Crawl4AI
25
+ from crawl4ai import AsyncWebCrawler
26
+
27
  SUPADATA_API_KEY = os.getenv("SUPADATA")
28
  supadata = Supadata(api_key=SUPADATA_API_KEY)
29
 
 
43
  "headless": True
44
  }
45
 
46
+
47
  def get_data(search_term):
48
  """
49
  Run the SearchGraph for a given search term.
 
99
 
100
  SUPADATA_API_KEY = os.getenv("SUPADATA")
101
 
102
+ def get_data_from_url(url, scraping_tool="supadata"):
 
 
 
 
 
 
103
  """
104
+ Scrape the provided URL using the selected scraping tool.
105
+
106
+ Args:
107
+ url: The URL to scrape
108
+ scraping_tool: Either "supadata", "crawl4ai", or "playwright"
109
+
110
+ Returns:
111
+ Dictionary containing the extracted grant data
112
  """
113
  page_content = None # Placeholder for storing scraped page content
114
 
115
+ # Choose the scraping method based on the selected tool
116
+ if scraping_tool == "crawl4ai":
 
 
 
 
 
 
 
 
 
 
117
  try:
118
+ # Use Crawl4AI for scraping
119
+ async def run_crawler():
120
+ async with AsyncWebCrawler() as crawler:
121
+ result = await crawler.arun(url=url)
122
+ return result.markdown
123
 
124
+ # Run the async crawler in a synchronous context
125
+ loop = asyncio.new_event_loop()
126
+ asyncio.set_event_loop(loop)
127
+ page_content = loop.run_until_complete(run_crawler())
128
+ loop.close()
129
+
130
+ st.success("Successfully scraped using Crawl4AI")
131
  except Exception as e:
132
+ st.error(f"Error using Crawl4AI: {e}")
133
+ # Fall back to Supadata if Crawl4AI fails
134
+ st.warning("Falling back to Supadata scraper...")
135
+ scraping_tool = "supadata"
136
+
137
+ if scraping_tool == "playwright":
138
  try:
139
+ loader = PlaywrightURLLoader(urls=[url], remove_selectors=["header", "footer"])
140
+ data = loader.aload()
141
+ page_content = data[0].page_content if data else ""
142
+ st.success("Successfully scraped using Playwright")
143
+ except Exception as e:
144
+ st.error(f"Error using Playwright: {e}")
145
+ # Fall back to Supadata if Playwright fails
146
+ st.warning("Falling back to Supadata scraper...")
147
+ scraping_tool = "supadata"
148
+
149
+ if scraping_tool == "supadata":
150
+ # **Step 1: Attempt Supadata's Built-in Scraper**
151
+ try:
152
+ web_content = supadata.web.scrape(url)
153
+ page_content = web_content.content
154
+ st.success("Successfully scraped using Supadata built-in scraper")
155
+ except TypeError as te:
156
+ if "unexpected keyword argument 'type'" in str(te):
157
+ st.warning("Falling back to Supadata API due to unexpected keyword 'type' error.")
158
  else:
159
+ st.error(f"Unexpected error in Supadata scrape: {te}")
160
+
161
+ # **Step 2: If Supadata's Built-in Scraper Fails, Use Supadata API**
162
+ if not page_content:
163
+ try:
164
+ api_url = "https://api.supadata.ai/v1/web/scrape"
165
+ headers = {"X-API-Key": SUPADATA_API_KEY}
166
+ response = requests.get(api_url, headers=headers, params={"url": url})
167
+
168
+ if response.status_code == 200:
169
+ page_content = response.json().get("content", "")
170
+ st.success("Successfully scraped using Supadata API")
171
+ else:
172
+ st.error(f"Supadata API failed with status {response.status_code}")
173
+ except Exception as e:
174
+ st.error(f"Error calling Supadata API: {e}")
175
+
176
+ # **Step 3: If Supadata API Fails, Use Direct Web Request**
177
+ if not page_content:
178
+ try:
179
+ r = requests.get(url, timeout=10)
180
+ if r.status_code == 200:
181
+ page_content = r.text
182
+ st.success("Successfully retrieved content with direct request")
183
+ else:
184
+ st.error(f"Manual scraping failed with status code {r.status_code}")
185
+ return {}
186
+ except Exception as e:
187
+ st.error(f"Manual scraping error: {e}")
188
  return {}
189
+
190
+ # If we still don't have content after all attempts
191
+ if not page_content:
192
+ st.error("Failed to retrieve content from the URL with all available methods")
193
+ return {}
194
 
195
  # **Pass Content to Gemini AI**
196
  full_prompt = (
 
238
 
239
 
240
 
 
 
241
  def process_multiple_search_terms(search_terms):
242
  """
243
  Process multiple search terms with progress tracking.
 
355
  "Enter URL to scrape for grant opportunities",
356
  placeholder="https://example.com/grants"
357
  )
358
+
359
+ # Scraping tool selector
360
+ scraping_tool = st.sidebar.radio(
361
+ "Select Scraping Tool:",
362
+ ("Supadata", "Crawl4AI", "Playwright"),
363
+ key="scraping_tool_selector"
364
+ )
365
 
366
  # Execute based on input type selection
367
  if input_type == "Search Query":
 
381
  else: # URL input
382
  if st.sidebar.button("🔍 Scrape URL for Grant Opportunities"):
383
  if url_input:
384
+ with st.spinner(f"Scraping URL using {scraping_tool}... Please wait patiently."):
385
+ result = get_data_from_url(url_input, scraping_tool.lower())
386
  st.session_state.scraped_data = result
387
  if result.get("grants"):
388
  st.sidebar.success(f"✅ Found {len(result['grants'])} grant opportunities from the URL!")
 
460
  )
461
 
462
  if __name__ == "__main__":
463
+ main()