rairo committed on
Commit
1ae42b1
·
verified ·
1 Parent(s): 8bb8601

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +112 -14
main.py CHANGED
@@ -4,8 +4,12 @@ import json
4
  import time
5
  import subprocess
6
  import nest_asyncio
 
7
  from scrapegraphai.graphs import SearchGraph
8
- from flask_cors import CORS, cross_origin
 
 
 
9
 
10
  # Ensure Playwright installs required browsers and dependencies
11
  subprocess.run(["playwright", "install"])
@@ -14,11 +18,18 @@ nest_asyncio.apply()
14
  app = Flask(__name__)
15
  CORS(app)
16
 
17
- # Set your Google API key as an environment variable.
18
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
19
  if not GOOGLE_API_KEY:
20
  raise ValueError("GOOGLE_API_KEY environment variable is not set.")
21
 
 
 
 
 
 
 
 
22
  graph_config = {
23
  "llm": {
24
  "api_key": GOOGLE_API_KEY,
@@ -34,9 +45,7 @@ def get_data(search_term):
34
  """
35
  Run the SearchGraph for a given search term.
36
  If a rate-limit error (202) occurs, wait 10 seconds and retry.
37
- Includes debugging steps to compare Flask and Streamlit behavior.
38
  """
39
-
40
  full_prompt = (
41
  f"search for {search_term} grants\n\n"
42
  "List me all grants or funds with:\n"
@@ -59,7 +68,6 @@ def get_data(search_term):
59
  try:
60
  search_graph = SearchGraph(prompt=full_prompt, config=graph_config)
61
  result = search_graph.run()
62
-
63
  print("\n=== DEBUG: Raw result from search_graph.run() ===")
64
  print(result)
65
  print("===========================================")
@@ -73,7 +81,6 @@ def get_data(search_term):
73
  print("ERROR: Failed to parse JSON from search result.")
74
  return {"error": "Failed to parse JSON from search result."}
75
 
76
- # Check if grants data exists
77
  if not result or "grants" not in result or not result["grants"]:
78
  print(f"DEBUG: No grants found for '{search_term}'.")
79
  return {"error": f"No results returned for '{search_term}'. Please try again with a different search term."}
@@ -88,26 +95,20 @@ def get_data(search_term):
88
  if "202" in err_str:
89
  print("DEBUG: Rate limit (202) detected. Retrying in 10 seconds...")
90
  time.sleep(10)
91
-
92
  try:
93
  search_graph = SearchGraph(prompt=full_prompt, config=graph_config)
94
  result = search_graph.run()
95
-
96
  print("\n=== DEBUG: Retrying search_graph.run() ===")
97
  print(result)
98
  print("===========================================")
99
-
100
  if not result or "grants" not in result or not result["grants"]:
101
  print(f"DEBUG: No grants found after retry for '{search_term}'.")
102
  return {"error": f"No results returned for '{search_term}' after retry. Please try again with a different search term."}
103
-
104
  print("DEBUG: Grants found on retry, returning results.")
105
  return result
106
-
107
  except Exception as e2:
108
  print(f"ERROR: Retry failed - {str(e2)}")
109
  return {"error": f"Retry failed for '{search_term}': {str(e2)}. Please try again later."}
110
-
111
  else:
112
  return {"error": f"An error occurred for '{search_term}': {str(e)}. Please try again."}
113
 
@@ -127,10 +128,11 @@ def process_multiple_search_terms(search_terms):
127
  all_data["grants"].extend(result["grants"])
128
  return all_data
129
 
 
130
  @app.route("/scrape", methods=["POST"])
131
  def scrape():
132
  """
133
- Endpoint to scrape grant opportunities.
134
  Expects a JSON body with the key 'search_terms' (a string with newline-separated search terms
135
  or a list of strings). Returns JSON with the aggregated results.
136
  """
@@ -139,7 +141,6 @@ def scrape():
139
  return jsonify({"error": "Request must include 'search_terms' key."}), 400
140
 
141
  search_terms = data["search_terms"]
142
- # If search_terms is a string, split it by newlines.
143
  if isinstance(search_terms, str):
144
  search_terms = [s.strip() for s in search_terms.split("\n") if s.strip()]
145
  elif not isinstance(search_terms, list):
@@ -151,5 +152,102 @@ def scrape():
151
  result = process_multiple_search_terms(search_terms)
152
  return jsonify(result), 200
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  if __name__ == "__main__":
155
  app.run(debug=True, host="0.0.0.0", port=7860)
 
4
  import time
5
  import subprocess
6
  import nest_asyncio
7
+ import requests # For API fallback call to Supadata
8
  from scrapegraphai.graphs import SearchGraph
9
+ from flask_cors import CORS
10
+ from google import genai
11
+ from google.genai import types
12
+ from supadata import Supadata, SupadataError
13
 
14
  # Ensure Playwright installs required browsers and dependencies
15
  subprocess.run(["playwright", "install"])
 
18
  app = Flask(__name__)
19
  CORS(app)
20
 
21
+ # Environment variables
22
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
23
  if not GOOGLE_API_KEY:
24
  raise ValueError("GOOGLE_API_KEY environment variable is not set.")
25
 
26
+ SUPADATA_API_KEY = os.environ.get("SUPADATA_API_KEY")
27
+ if not SUPADATA_API_KEY:
28
+ raise ValueError("SUPADATA_API_KEY environment variable is not set.")
29
+
30
+ # Initialize Supadata client
31
+ supadata = Supadata(api_key=SUPADATA_API_KEY)
32
+
33
  graph_config = {
34
  "llm": {
35
  "api_key": GOOGLE_API_KEY,
 
45
  """
46
  Run the SearchGraph for a given search term.
47
  If a rate-limit error (202) occurs, wait 10 seconds and retry.
 
48
  """
 
49
  full_prompt = (
50
  f"search for {search_term} grants\n\n"
51
  "List me all grants or funds with:\n"
 
68
  try:
69
  search_graph = SearchGraph(prompt=full_prompt, config=graph_config)
70
  result = search_graph.run()
 
71
  print("\n=== DEBUG: Raw result from search_graph.run() ===")
72
  print(result)
73
  print("===========================================")
 
81
  print("ERROR: Failed to parse JSON from search result.")
82
  return {"error": "Failed to parse JSON from search result."}
83
 
 
84
  if not result or "grants" not in result or not result["grants"]:
85
  print(f"DEBUG: No grants found for '{search_term}'.")
86
  return {"error": f"No results returned for '{search_term}'. Please try again with a different search term."}
 
95
  if "202" in err_str:
96
  print("DEBUG: Rate limit (202) detected. Retrying in 10 seconds...")
97
  time.sleep(10)
 
98
  try:
99
  search_graph = SearchGraph(prompt=full_prompt, config=graph_config)
100
  result = search_graph.run()
 
101
  print("\n=== DEBUG: Retrying search_graph.run() ===")
102
  print(result)
103
  print("===========================================")
 
104
  if not result or "grants" not in result or not result["grants"]:
105
  print(f"DEBUG: No grants found after retry for '{search_term}'.")
106
  return {"error": f"No results returned for '{search_term}' after retry. Please try again with a different search term."}
 
107
  print("DEBUG: Grants found on retry, returning results.")
108
  return result
 
109
  except Exception as e2:
110
  print(f"ERROR: Retry failed - {str(e2)}")
111
  return {"error": f"Retry failed for '{search_term}': {str(e2)}. Please try again later."}
 
112
  else:
113
  return {"error": f"An error occurred for '{search_term}': {str(e)}. Please try again."}
114
 
 
128
  all_data["grants"].extend(result["grants"])
129
  return all_data
130
 
131
+
132
  @app.route("/scrape", methods=["POST"])
133
  def scrape():
134
  """
135
+ Endpoint to scrape grant opportunities using search terms.
136
  Expects a JSON body with the key 'search_terms' (a string with newline-separated search terms
137
  or a list of strings). Returns JSON with the aggregated results.
138
  """
 
141
  return jsonify({"error": "Request must include 'search_terms' key."}), 400
142
 
143
  search_terms = data["search_terms"]
 
144
  if isinstance(search_terms, str):
145
  search_terms = [s.strip() for s in search_terms.split("\n") if s.strip()]
146
  elif not isinstance(search_terms, list):
 
152
  result = process_multiple_search_terms(search_terms)
153
  return jsonify(result), 200
154
 
155
+
156
def _fetch_page_content(url):
    """
    Return the text content of *url*, or None when both scrape paths fail.

    Primary path is the Supadata client library; on failure it falls back
    to a direct call against the Supadata REST API.
    """
    try:
        return supadata.web.scrape(url).content
    except (TypeError, SupadataError) as err:
        # FIX: the original only caught TypeError (a specific "unexpected
        # keyword argument 'type'" incompatibility), so a SupadataError or
        # client failure bypassed the REST fallback and crashed the request.
        print(f"Supadata client scrape failed ({err}); falling back to Supadata API.")

    try:
        response = requests.get(
            "https://api.supadata.ai/v1/web/scrape",
            headers={"X-API-Key": SUPADATA_API_KEY},
            params={"url": url},
        )
        if response.status_code == 200:
            return response.json().get("content", "")
        print(f"Supadata API failed with status {response.status_code}")
    except Exception as e:
        print(f"Error calling Supadata API: {e}")
    return None


def _extract_grants_json(response_text):
    """
    Parse Gemini's reply into a {"grants": [...]} dict; return {} on failure.

    The model may answer with either a JSON array or a JSON object, possibly
    wrapped in prose or markdown fences, so the outermost JSON span is sliced
    out before parsing.
    """
    try:
        array_start = response_text.find('[')
        object_start = response_text.find('{')
        if array_start == -1 and object_start == -1:
            raise ValueError("no JSON payload found in model response")
        # FIX: the original assumed a '[' was always present; find() returning
        # -1 produced a garbage slice whenever Gemini answered with an object.
        if array_start != -1 and (object_start == -1 or array_start < object_start):
            start, end = array_start, response_text.rfind(']') + 1
        else:
            start, end = object_start, response_text.rfind('}') + 1
        result = json.loads(response_text[start:end])
    except Exception as parse_error:
        # FIX: include the actual parse error (the original captured it
        # but never logged it).
        print(f"Error parsing JSON from Gemini model response ({parse_error}). "
              f"Response: {response_text}")
        return {}

    # Normalize a bare array into the {"grants": [...]} envelope callers expect.
    if isinstance(result, list):
        result = {"grants": result}
    return result


def get_data_from_url(url):
    """
    Scrape the provided URL and extract grant data from it with Gemini.

    Scraping is attempted with the Supadata client first, then the Supadata
    REST API as a fallback. The page text is then sent to Gemini, whose JSON
    answer is parsed and normalized.

    Args:
        url: The web page to scrape for grant opportunities.

    Returns:
        A dict of the form {"grants": [...]} on success, or an empty dict
        when scraping fails, the model output cannot be parsed, or no
        grants are found.
    """
    page_content = _fetch_page_content(url)
    if not page_content:
        # FIX: the original forwarded an empty page to Gemini, wasting a
        # model call that could not possibly yield grants.
        print("No page content could be retrieved for the URL.")
        return {}

    # Prompt text is part of the model contract — kept verbatim.
    full_prompt = (
        "Extract the following grant data from the provided web content. "
        "- Grant name/title\n"
        "- Short summary\n"
        "- Funding organization\n"
        "- Grant value (numeric only)\n"
        "- Application deadline\n"
        "- Eligible countries\n"
        "- Sector/field\n"
        "- Eligibility criteria\n"
        "Return in JSON format.\n\n"
        f"Web content: {page_content}"
    )

    client = genai.Client(api_key=GOOGLE_API_KEY)
    new_answer = client.models.generate_content(
        model="models/gemini-2.0-flash-lite",
        contents=f"{full_prompt}, return the json string and nothing else",
    )

    result = _extract_grants_json(new_answer.text)
    if not result.get("grants"):
        print("No grant opportunities found in the scraped URL.")
        return {}

    print(f"First grant opportunity: {result['grants'][0]}")
    return result
231
+
232
+
233
@app.route("/scrape_url", methods=["POST"])
def scrape_url():
    """
    Scrape a single URL for grant opportunities.

    Expects a JSON body containing a 'url' key; responds with the extracted
    grant data as JSON, or an error payload (400 for a bad request, 500 when
    scraping fails or yields no grants).
    """
    payload = request.get_json()
    if not payload or "url" not in payload:
        return jsonify({"error": "Request must include 'url' key."}), 400

    scraped = get_data_from_url(payload["url"])
    if not scraped:
        return jsonify({"error": "Failed to scrape URL or no grants found."}), 500

    return jsonify(scraped), 200
250
+
251
+
252
# Script entry point: start Flask's built-in development server, listening
# on all interfaces on port 7860.
# NOTE(review): debug=True enables the interactive debugger — confirm this
# is not exposed in production.
if __name__ == "__main__":
    app.run(debug=True, host="0.0.0.0", port=7860)