Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -89,68 +89,104 @@ def get_data(search_term):
|
|
| 89 |
st.error(f"An error occurred for search term: {search_term}, error: {e}. Please try again.")
|
| 90 |
return {}
|
| 91 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
def get_data_from_url(url):
|
| 93 |
"""
|
| 94 |
-
Scrape the provided URL using Supadata
|
| 95 |
-
|
| 96 |
"""
|
|
|
|
|
|
|
|
|
|
| 97 |
try:
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
try:
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
web_content = supadata.scrape(url)
|
| 107 |
-
else:
|
| 108 |
-
st.error("Fallback method not available: supadata does not have 'scrape'.")
|
| 109 |
-
return {}
|
| 110 |
else:
|
| 111 |
-
|
|
|
|
|
|
|
| 112 |
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
"Extract the following grant data from the provided web content. "
|
| 116 |
-
"- Grant name/title\n"
|
| 117 |
-
"- Short summary\n"
|
| 118 |
-
"- Funding organization\n"
|
| 119 |
-
"- Grant value (numeric only)\n"
|
| 120 |
-
"- Application deadline\n"
|
| 121 |
-
"- Eligible countries\n"
|
| 122 |
-
"- Sector/field\n"
|
| 123 |
-
"- Eligibility criteria\n"
|
| 124 |
-
"Return in JSON format.\n\n"
|
| 125 |
-
f"Web content: {page_content}"
|
| 126 |
-
)
|
| 127 |
-
client = genai.Client(api_key=GOOGLE_API_KEY)
|
| 128 |
-
new_answer = client.models.generate_content(
|
| 129 |
-
model="models/gemini-2.0-flash-lite",
|
| 130 |
-
contents=f"{full_prompt}, return the json string and nothing else"
|
| 131 |
-
)
|
| 132 |
-
response = new_answer.text
|
| 133 |
try:
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
# If result is a list, wrap it in a dictionary with the key "grants"
|
| 143 |
-
if isinstance(result, list):
|
| 144 |
-
result = {"grants": result}
|
| 145 |
-
if not result.get("grants"):
|
| 146 |
-
st.error("No grant opportunities found in the scraped URL.")
|
| 147 |
return {}
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
return {}
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
|
| 156 |
|
|
|
|
| 89 |
st.error(f"An error occurred for search term: {search_term}, error: {e}. Please try again.")
|
| 90 |
return {}
|
| 91 |
|
| 92 |
+
|
| 93 |
+
import requests
|
| 94 |
+
import json
|
| 95 |
+
import streamlit as st
|
| 96 |
+
import google.generativeai as genai
|
| 97 |
+
|
| 98 |
+
import os

# API credentials. Each key is read from the environment variable of the same
# name when it is set, so real keys never have to be committed to this file.
# The original placeholder strings remain as defaults, which keeps the module
# importable (and behaving exactly as before) when nothing is configured.
SUPADATA_API_KEY = os.environ.get("SUPADATA_API_KEY", "your_supadata_api_key")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "your_google_api_key")
|
| 100 |
+
|
| 101 |
def get_data_from_url(url):
    """
    Scrape *url* and extract grant data from the page with Gemini.

    The page content is fetched through a fallback chain that stops at the
    first step yielding non-empty content:

    1. ``supadata.web.scrape(url)``.
    2. A GET request to the Supadata REST endpoint, sent with
       ``SUPADATA_API_KEY`` in the ``X-API-Key`` header.
    3. A direct ``requests.get(url)``.

    The content is embedded in an extraction prompt, sent to Gemini, and the
    JSON found in the reply is parsed.

    Args:
        url: Address of the page to scrape.

    Returns:
        ``{"grants": [...]}`` with at least one entry on success, otherwise
        ``{}``. Failures are reported through ``st.error`` / ``st.warning``
        instead of being raised.
    """
    page_content = _fetch_page_content(url)
    if not page_content:
        # Every failure path in the helper has already reported itself.
        return {}

    full_prompt = (
        "Extract the following grant data from the provided web content. "
        "- Grant name/title\n"
        "- Short summary\n"
        "- Funding organization\n"
        "- Grant value (numeric only)\n"
        "- Application deadline\n"
        "- Eligible countries\n"
        "- Sector/field\n"
        "- Eligibility criteria\n"
        "Return in JSON format.\n\n"
        f"Web content: {page_content}"
    )

    # A model/SDK failure is reported like every other failure in this
    # function (message + empty dict) instead of crashing the caller.
    try:
        response = _generate_with_gemini(
            f"{full_prompt}, return the json string and nothing else"
        )
    except Exception as e:
        st.error(f"Error calling Gemini model: {e}")
        return {}

    try:
        parsed = _extract_json_payload(response)
    except ValueError:
        st.error(f"Error parsing JSON from Gemini model response. Response: {response}")
        return {}

    # Accept the three shapes the model can plausibly return: a list of
    # grants, an object holding them under "grants", or one bare grant object.
    if isinstance(parsed, dict) and "grants" in parsed:
        grants = parsed["grants"]
    else:
        grants = parsed
    if isinstance(grants, dict):
        grants = [grants] if grants else []

    if not isinstance(grants, list) or not grants:
        st.error("No grant opportunities found in the scraped URL.")
        return {}

    result = {"grants": grants}
    st.success(f"First grant opportunity: {result['grants'][0]}")
    return result


# Upper bound, in seconds, for each outbound HTTP request made while scraping.
_REQUEST_TIMEOUT_SECONDS = 10


def _fetch_page_content(url):
    """Return the content of *url* via the fallback chain, or None on failure.

    Each failing step reports itself through Streamlit before the next step
    is tried.
    """
    page_content = None

    # Step 1: Supadata's built-in scraper.
    try:
        web_content = supadata.web.scrape(url)
        page_content = web_content.content
    except TypeError as te:
        if "unexpected keyword argument 'type'" in str(te):
            st.warning("Falling back to Supadata API due to unexpected keyword 'type' error.")
        else:
            st.error(f"Unexpected error in Supadata scrape: {te}")
    except Exception as e:
        # Any other scraper failure must not abort the function: two
        # fallbacks remain.
        st.warning(f"Supadata scrape failed, falling back to Supadata API: {e}")

    # Step 2: Supadata REST API.
    if not page_content:
        try:
            response = requests.get(
                "https://api.supadata.ai/v1/web/scrape",
                headers={"X-API-Key": SUPADATA_API_KEY},
                params={"url": url},
                # Without a timeout a stalled connection would hang the app.
                timeout=_REQUEST_TIMEOUT_SECONDS,
            )
            if response.status_code == 200:
                page_content = response.json().get("content", "")
            else:
                st.error(f"Supadata API failed with status {response.status_code}")
        except Exception as e:
            st.error(f"Error calling Supadata API: {e}")

    # Step 3: direct web request.
    if not page_content:
        try:
            r = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
        except Exception as e:
            st.error(f"Manual scraping error: {e}")
            return None
        if r.status_code != 200:
            st.error(f"Manual scraping failed with status code {r.status_code}")
            return None
        page_content = r.text

    if not page_content:
        # A 200 response with an empty body: nothing to hand to the model.
        st.error("No content could be retrieved from the URL.")
        return None
    return page_content


def _generate_with_gemini(prompt):
    """Send *prompt* to Gemini and return the reply text ("" if it has none)."""
    model_name = "models/gemini-2.0-flash-lite"
    # NOTE(review): `genai.Client` is the interface of the google-genai SDK
    # (`from google import genai`); the `google.generativeai` package offers
    # `configure` / `GenerativeModel` instead. Work with whichever module is
    # bound to `genai` -- confirm which SDK the app is meant to use.
    if hasattr(genai, "Client"):
        client = genai.Client(api_key=GOOGLE_API_KEY)
        answer = client.models.generate_content(model=model_name, contents=prompt)
    else:
        genai.configure(api_key=GOOGLE_API_KEY)
        answer = genai.GenerativeModel(model_name).generate_content(prompt)
    return answer.text or ""


def _extract_json_payload(text):
    """Decode the first JSON array or object embedded in *text*.

    Surrounding prose or Markdown code fences are ignored. Raises
    ``ValueError`` when no array or object can be decoded.
    """
    decoder = json.JSONDecoder()
    for start, char in enumerate(text):
        if char not in "[{":
            continue
        try:
            value, _ = decoder.raw_decode(text, start)
        except ValueError:
            # This bracket did not open valid JSON; try the next candidate.
            continue
        return value
    raise ValueError("no JSON array or object found in model response")
|
| 188 |
+
|
| 189 |
+
|
| 190 |
|
| 191 |
|
| 192 |
|