rairo committed on
Commit
b7d111d
·
verified ·
1 Parent(s): 8a41058

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +240 -120
main.py CHANGED
@@ -2,18 +2,11 @@ from flask import Flask, request, jsonify
2
  import os
3
  import json
4
  import time
5
- import subprocess
6
- import nest_asyncio
7
- import requests # For API fallback call to Supadata
8
- from scrapegraphai.graphs import SearchGraph
9
  from flask_cors import CORS
10
  from google import genai
11
  from google.genai import types
12
- from supadata import Supadata, SupadataError
13
-
14
- # Ensure Playwright installs required browsers and dependencies
15
- subprocess.run(["playwright", "install"])
16
- nest_asyncio.apply()
17
 
18
  app = Flask(__name__)
19
  CORS(app)
@@ -23,63 +16,104 @@ GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
23
  if not GOOGLE_API_KEY:
24
  raise ValueError("GOOGLE_API_KEY environment variable is not set.")
25
 
26
- SUPADATA_API_KEY = os.environ.get("SUPADATA")
27
- if not SUPADATA_API_KEY:
28
- raise ValueError("SUPADATA_API_KEY environment variable is not set.")
29
 
30
- # Initialize Supadata client
31
- supadata = Supadata(api_key=SUPADATA_API_KEY)
 
32
 
33
- graph_config = {
34
- "llm": {
35
- "api_key": GOOGLE_API_KEY,
36
- "model": "google_genai/gemini-2.0-flash-lite",
37
- },
38
- "max_results": 8,
39
- "verbose": True,
40
- "headless": True
41
- }
42
 
43
 
44
  def get_data(search_term):
45
  """
46
- Run the SearchGraph for a given search term.
47
- If a rate-limit error (202) occurs, wait 10 seconds and retry.
48
  """
49
- full_prompt = (
50
- f"search for {search_term} grants\n\n"
51
- "List me all grants or funds with:\n"
52
- "- Grant name/title\n"
53
- "- Short summary \n"
54
- "- Funding organization\n"
55
- "- Grant value (numeric only)\n"
56
- "- Application deadline\n"
57
- "- Eligible countries\n"
58
- "- Sector/field\n"
59
- "- Eligibility criteria\n"
60
- "- link URL\n"
61
- "Return in JSON format."
62
- )
63
 
64
  print("\n=== DEBUG: Start get_data() ===")
65
  print(f"Search Term: {search_term}")
66
- print(f"Full Prompt:\n{full_prompt}\n")
67
 
68
  try:
69
- search_graph = SearchGraph(prompt=full_prompt, config=graph_config)
70
- result = search_graph.run()
71
- print("\n=== DEBUG: Raw result from search_graph.run() ===")
72
- print(result)
 
 
 
 
 
73
  print("===========================================")
74
 
75
- # Ensure result is in dictionary format
76
- if isinstance(result, str):
77
- try:
78
- result = json.loads(result)
79
- print("DEBUG: Successfully parsed JSON string into dictionary.")
80
- except json.JSONDecodeError:
81
- print("ERROR: Failed to parse JSON from search result.")
82
- return {"error": "Failed to parse JSON from search result."}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
  if not result or "grants" not in result or not result["grants"]:
85
  print(f"DEBUG: No grants found for '{search_term}'.")
@@ -92,20 +126,81 @@ def get_data(search_term):
92
  err_str = str(e)
93
  print(f"ERROR: Exception occurred - {err_str}")
94
 
95
- if "202" in err_str:
96
- print("DEBUG: Rate limit (202) detected. Retrying in 10 seconds...")
 
97
  time.sleep(10)
98
  try:
99
- search_graph = SearchGraph(prompt=full_prompt, config=graph_config)
100
- result = search_graph.run()
101
- print("\n=== DEBUG: Retrying search_graph.run() ===")
102
- print(result)
103
- print("===========================================")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  if not result or "grants" not in result or not result["grants"]:
105
  print(f"DEBUG: No grants found after retry for '{search_term}'.")
106
  return {"error": f"No results returned for '{search_term}' after retry. Please try again with a different search term."}
 
107
  print("DEBUG: Grants found on retry, returning results.")
108
  return result
 
109
  except Exception as e2:
110
  print(f"ERROR: Retry failed - {str(e2)}")
111
  return {"error": f"Retry failed for '{search_term}': {str(e2)}. Please try again later."}
@@ -155,80 +250,105 @@ def scrape():
155
 
156
  def get_data_from_url(url):
157
  """
158
- Scrape the provided URL using Supadata. If it fails, fall back to the Supadata API.
159
  Extract grant data using Gemini AI.
160
  """
161
- page_content = None # Placeholder for storing scraped page content
 
162
 
163
- # Step 1: Attempt Supadata's built-in scraper
164
  try:
165
- web_content = supadata.web.scrape(url)
166
- page_content = web_content.content
167
- except TypeError as te:
168
- if "unexpected keyword argument 'type'" in str(te):
169
- print("Falling back to Supadata API due to unexpected keyword 'type' error.")
 
 
 
 
 
 
 
 
 
 
 
170
  else:
171
- print(f"Unexpected error in Supadata scrape: {te}")
172
 
173
- # Step 2: If Supadata's built-in scraper fails, use Supadata API
174
- if not page_content:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  try:
176
- api_url = "https://api.supadata.ai/v1/web/scrape"
177
- headers = {"X-API-Key": SUPADATA_API_KEY}
178
- response = requests.get(api_url, headers=headers, params={"url": url})
179
- if response.status_code == 200:
180
- page_content = response.json().get("content", "")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  else:
182
- print(f"Supadata API failed with status {response.status_code}")
183
- return {}
184
- except Exception as e:
185
- print(f"Error calling Supadata API: {e}")
 
186
  return {}
187
 
188
- # Pass content to Gemini AI
189
- full_prompt = (
190
- "Extract the following grant data from the provided web content. "
191
- "- Grant name/title\n"
192
- "- Short summary\n"
193
- "- Funding organization\n"
194
- "- Grant value (numeric only)\n"
195
- "- Application deadline\n"
196
- "- Eligible countries\n"
197
- "- Sector/field\n"
198
- "- Eligibility criteria\n"
199
- "Return in JSON format.\n\n"
200
- f"Web content: {page_content}"
201
- )
202
-
203
- client = genai.Client(api_key=GOOGLE_API_KEY)
204
- new_answer = client.models.generate_content(
205
- model="models/gemini-2.0-flash-lite",
206
- contents=f"{full_prompt}, return the json string and nothing else"
207
- )
208
-
209
- response = new_answer.text
210
-
211
- # Extract JSON output from Gemini
212
- try:
213
- start_index = response.find('[')
214
- end_index = response.rfind(']') + 1
215
- json_string = response[start_index:end_index]
216
- result = json.loads(json_string)
217
- except Exception as parse_error:
218
- print(f"Error parsing JSON from Gemini model response. Response: {response}")
219
- return {}
220
 
221
- # Ensure JSON is wrapped correctly
222
- if isinstance(result, list):
223
- result = {"grants": result}
 
 
224
 
225
- if not result.get("grants"):
226
- print("No grant opportunities found in the scraped URL.")
227
  return {}
228
 
229
- print(f"First grant opportunity: {result['grants'][0]}")
230
- return result
231
-
232
 
233
  @app.route("/scrape_url", methods=["POST"])
234
  def scrape_url():
@@ -250,4 +370,4 @@ def scrape_url():
250
 
251
 
252
  if __name__ == "__main__":
253
- app.run(debug=True, host="0.0.0.0", port=7860)
 
2
  import os
3
  import json
4
  import time
 
 
 
 
5
  from flask_cors import CORS
6
  from google import genai
7
  from google.genai import types
8
+ from exa_py import Exa
9
+ from linkup import LinkupClient
 
 
 
10
 
11
  app = Flask(__name__)
12
  CORS(app)
 
16
  if not GOOGLE_API_KEY:
17
  raise ValueError("GOOGLE_API_KEY environment variable is not set.")
18
 
19
+ EXA_API_KEY = os.environ.get("EXA_API_KEY")
20
+ if not EXA_API_KEY:
21
+ raise ValueError("EXA_API_KEY environment variable is not set.")
22
 
23
+ LINKUP_API_KEY = os.environ.get("LINKUP_API_KEY")
24
+ if not LINKUP_API_KEY:
25
+ raise ValueError("LINKUP_API_KEY environment variable is not set.")
26
 
27
+ # Initialize clients
28
+ exa = Exa(api_key=EXA_API_KEY)
29
+ linkup_client = LinkupClient(api_key=LINKUP_API_KEY)
 
 
 
 
 
 
30
 
31
 
32
  def get_data(search_term):
33
  """
34
+ Run the Linkup deep search for a given search term.
35
+ If a rate-limit error occurs, wait 10 seconds and retry.
36
  """
37
+ full_query = f"{search_term} grants funding opportunities"
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  print("\n=== DEBUG: Start get_data() ===")
40
  print(f"Search Term: {search_term}")
41
+ print(f"Full Query: {full_query}\n")
42
 
43
  try:
44
+ response = linkup_client.search(
45
+ query=full_query,
46
+ depth="deep",
47
+ output_type="sourcedAnswer",
48
+ include_images=False,
49
+ )
50
+
51
+ print("\n=== DEBUG: Raw result from linkup search ===")
52
+ print(response)
53
  print("===========================================")
54
 
55
+ # Extract the answer content from Linkup response
56
+ content = ""
57
+ if hasattr(response, 'answer'):
58
+ content = response.answer
59
+ elif isinstance(response, dict) and 'answer' in response:
60
+ content = response['answer']
61
+ else:
62
+ content = str(response)
63
+
64
+ # Process the content with Gemini AI to extract structured grant data
65
+ structured_prompt = (
66
+ f"Based on the following search results about {search_term} grants, "
67
+ "extract and structure grant information with:\n"
68
+ "- Grant name/title\n"
69
+ "- Short summary \n"
70
+ "- Funding organization\n"
71
+ "- Grant value (numeric only)\n"
72
+ "- Application deadline\n"
73
+ "- Eligible countries\n"
74
+ "- Sector/field\n"
75
+ "- Eligibility criteria\n"
76
+ "- link URL\n"
77
+ "Return in JSON format with a 'grants' array.\n\n"
78
+ f"Search results: {content}"
79
+ )
80
+
81
+ client = genai.Client(api_key=GOOGLE_API_KEY)
82
+ gemini_response = client.models.generate_content(
83
+ model="models/gemini-2.0-flash-lite",
84
+ contents=f"{structured_prompt}, return the json string and nothing else"
85
+ )
86
+
87
+ gemini_text = gemini_response.text
88
+ print(f"DEBUG: Gemini response: {gemini_text}")
89
+
90
+ # Parse JSON from Gemini response
91
+ try:
92
+ # Try to find JSON in the response
93
+ start_index = gemini_text.find('{')
94
+ if start_index == -1:
95
+ start_index = gemini_text.find('[')
96
+
97
+ if start_index != -1:
98
+ if gemini_text[start_index] == '{':
99
+ end_index = gemini_text.rfind('}') + 1
100
+ else:
101
+ end_index = gemini_text.rfind(']') + 1
102
+
103
+ json_string = gemini_text[start_index:end_index]
104
+ result = json.loads(json_string)
105
+
106
+ # Ensure result has grants array
107
+ if isinstance(result, list):
108
+ result = {"grants": result}
109
+ elif isinstance(result, dict) and "grants" not in result:
110
+ # If it's a dict but no grants key, assume it's a single grant
111
+ result = {"grants": [result]}
112
+ else:
113
+ result = {"grants": []}
114
+ except json.JSONDecodeError as je:
115
+ print(f"ERROR: Failed to parse JSON from Gemini response: {je}")
116
+ result = {"grants": []}
117
 
118
  if not result or "grants" not in result or not result["grants"]:
119
  print(f"DEBUG: No grants found for '{search_term}'.")
 
126
  err_str = str(e)
127
  print(f"ERROR: Exception occurred - {err_str}")
128
 
129
+ # Check for rate limiting or similar errors
130
+ if "rate" in err_str.lower() or "limit" in err_str.lower():
131
+ print("DEBUG: Rate limit detected. Retrying in 10 seconds...")
132
  time.sleep(10)
133
  try:
134
+ response = linkup_client.search(
135
+ query=full_query,
136
+ depth="deep",
137
+ output_type="sourcedAnswer",
138
+ include_images=False,
139
+ )
140
+
141
+ # Process retry response similar to above
142
+ content = ""
143
+ if hasattr(response, 'answer'):
144
+ content = response.answer
145
+ elif isinstance(response, dict) and 'answer' in response:
146
+ content = response['answer']
147
+ else:
148
+ content = str(response)
149
+
150
+ structured_prompt = (
151
+ f"Based on the following search results about {search_term} grants, "
152
+ "extract and structure grant information with:\n"
153
+ "- Grant name/title\n"
154
+ "- Short summary \n"
155
+ "- Funding organization\n"
156
+ "- Grant value (numeric only)\n"
157
+ "- Application deadline\n"
158
+ "- Eligible countries\n"
159
+ "- Sector/field\n"
160
+ "- Eligibility criteria\n"
161
+ "- link URL\n"
162
+ "Return in JSON format with a 'grants' array.\n\n"
163
+ f"Search results: {content}"
164
+ )
165
+
166
+ client = genai.Client(api_key=GOOGLE_API_KEY)
167
+ gemini_response = client.models.generate_content(
168
+ model="models/gemini-2.0-flash-lite",
169
+ contents=f"{structured_prompt}, return the json string and nothing else"
170
+ )
171
+
172
+ gemini_text = gemini_response.text
173
+
174
+ try:
175
+ start_index = gemini_text.find('{')
176
+ if start_index == -1:
177
+ start_index = gemini_text.find('[')
178
+
179
+ if start_index != -1:
180
+ if gemini_text[start_index] == '{':
181
+ end_index = gemini_text.rfind('}') + 1
182
+ else:
183
+ end_index = gemini_text.rfind(']') + 1
184
+
185
+ json_string = gemini_text[start_index:end_index]
186
+ result = json.loads(json_string)
187
+
188
+ if isinstance(result, list):
189
+ result = {"grants": result}
190
+ elif isinstance(result, dict) and "grants" not in result:
191
+ result = {"grants": [result]}
192
+ else:
193
+ result = {"grants": []}
194
+ except json.JSONDecodeError:
195
+ result = {"grants": []}
196
+
197
  if not result or "grants" not in result or not result["grants"]:
198
  print(f"DEBUG: No grants found after retry for '{search_term}'.")
199
  return {"error": f"No results returned for '{search_term}' after retry. Please try again with a different search term."}
200
+
201
  print("DEBUG: Grants found on retry, returning results.")
202
  return result
203
+
204
  except Exception as e2:
205
  print(f"ERROR: Retry failed - {str(e2)}")
206
  return {"error": f"Retry failed for '{search_term}': {str(e2)}. Please try again later."}
 
250
 
251
  def get_data_from_url(url):
252
  """
253
+ Scrape the provided URL using Exa API.
254
  Extract grant data using Gemini AI.
255
  """
256
+ print(f"\n=== DEBUG: Start get_data_from_url() ===")
257
+ print(f"URL: {url}")
258
 
 
259
  try:
260
+ # Use Exa to get content from URL
261
+ result = exa.get_contents(
262
+ [url],
263
+ text=True
264
+ )
265
+
266
+ print("\n=== DEBUG: Raw result from Exa ===")
267
+ print(result)
268
+ print("=====================================")
269
+
270
+ # Extract text content from Exa response
271
+ page_content = ""
272
+ if hasattr(result, 'results') and result.results:
273
+ page_content = result.results[0].text if hasattr(result.results[0], 'text') else str(result.results[0])
274
+ elif isinstance(result, dict) and 'results' in result and result['results']:
275
+ page_content = result['results'][0].get('text', str(result['results'][0]))
276
  else:
277
+ page_content = str(result)
278
 
279
+ if not page_content:
280
+ print("ERROR: No content extracted from URL")
281
+ return {}
282
+
283
+ print(f"DEBUG: Extracted content length: {len(page_content)}")
284
+
285
+ # Process content with Gemini AI
286
+ full_prompt = (
287
+ "Extract the following grant data from the provided web content. "
288
+ "- Grant name/title\n"
289
+ "- Short summary\n"
290
+ "- Funding organization\n"
291
+ "- Grant value (numeric only)\n"
292
+ "- Application deadline\n"
293
+ "- Eligible countries\n"
294
+ "- Sector/field\n"
295
+ "- Eligibility criteria\n"
296
+ "Return in JSON format with a 'grants' array.\n\n"
297
+ f"Web content: {page_content[:10000]}" # Limit content to avoid token limits
298
+ )
299
+
300
+ client = genai.Client(api_key=GOOGLE_API_KEY)
301
+ gemini_response = client.models.generate_content(
302
+ model="models/gemini-2.0-flash-lite",
303
+ contents=f"{full_prompt}, return the json string and nothing else"
304
+ )
305
+
306
+ response_text = gemini_response.text
307
+ print(f"DEBUG: Gemini response: {response_text}")
308
+
309
+ # Extract JSON output from Gemini
310
  try:
311
+ start_index = response_text.find('[')
312
+ if start_index == -1:
313
+ start_index = response_text.find('{')
314
+
315
+ if start_index != -1:
316
+ if response_text[start_index] == '[':
317
+ end_index = response_text.rfind(']') + 1
318
+ else:
319
+ end_index = response_text.rfind('}') + 1
320
+
321
+ json_string = response_text[start_index:end_index]
322
+ parsed_result = json.loads(json_string)
323
+
324
+ # Ensure JSON is wrapped correctly
325
+ if isinstance(parsed_result, list):
326
+ parsed_result = {"grants": parsed_result}
327
+ elif isinstance(parsed_result, dict) and "grants" not in parsed_result:
328
+ # If it's a dict but no grants key, assume it's a single grant
329
+ parsed_result = {"grants": [parsed_result]}
330
  else:
331
+ parsed_result = {"grants": []}
332
+
333
+ except Exception as parse_error:
334
+ print(f"Error parsing JSON from Gemini model response: {parse_error}")
335
+ print(f"Response: {response_text}")
336
  return {}
337
 
338
+ if not parsed_result.get("grants"):
339
+ print("No grant opportunities found in the scraped URL.")
340
+ return {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
 
342
+ print(f"DEBUG: Found {len(parsed_result['grants'])} grants")
343
+ if parsed_result['grants']:
344
+ print(f"First grant opportunity: {parsed_result['grants'][0]}")
345
+
346
+ return parsed_result
347
 
348
+ except Exception as e:
349
+ print(f"ERROR: Exception in get_data_from_url: {str(e)}")
350
  return {}
351
 
 
 
 
352
 
353
  @app.route("/scrape_url", methods=["POST"])
354
  def scrape_url():
 
370
 
371
 
372
  if __name__ == "__main__":
373
+ app.run(debug=True, host="0.0.0.0", port=7860)