Spaces:
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1103,6 +1103,91 @@ def get_random_header():
|
|
| 1103 |
]
|
| 1104 |
return random.choice(headers)
|
| 1105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1106 |
def clean_content(raw_content: str) -> str:
|
| 1107 |
# Parse HTML using BeautifulSoup (if not HTML, it will safely return the text)
|
| 1108 |
soup = BeautifulSoup(raw_content, "html.parser")
|
|
@@ -1227,12 +1312,7 @@ def analyze_with_gpt4o(query: str, snippet: str, breadth: int, temperature: floa
|
|
| 1227 |
snippet = summarize_large_text(snippet, target_length=2000, chunk_size=1000, overlap=200)
|
| 1228 |
snippet_words = len(snippet.split())
|
| 1229 |
|
| 1230 |
-
# Decide a proportional dynamic token count (for reference; not used to limit the API call below)
|
| 1231 |
-
dynamic_tokens = min(3000, max(250, int(snippet_words * 0.5)))
|
| 1232 |
-
|
| 1233 |
client = os.getenv('OPENAI_API_KEY') # alternatively, pass your API key here if needed.
|
| 1234 |
-
# (Assuming you use a client instance from your OpenAI library elsewhere.)
|
| 1235 |
-
# Here, we assume that openai.OpenAI(api_key=...) is wrapped by openai_call.
|
| 1236 |
|
| 1237 |
prompt = (f"""Analyze the following content from a query result:
|
| 1238 |
|
|
@@ -1248,7 +1328,7 @@ Instructions:
|
|
| 1248 |
- Key Facts (at least 5): List the core factual claims using short, declarative sentences or bullet points. Apply lemmatization and standard abbreviations.
|
| 1249 |
- Key Figures (at least 5): Extract numerical data (statistics, dates, percentages) and include any necessary context (units, references, explanations) required to interpret these numbers. Present them concisely (list or table format).
|
| 1250 |
- Key Arguments (at least 5): Identify main arguments or claims. Summarize supporting evidence and counter-arguments concisely.
|
| 1251 |
-
- Key Quotes (at least 1 if any): Include significant quotes (with the author's name in parentheses). Attribute quotes correctly. Paraphrase if needed, indicating that it
|
| 1252 |
- Structured Summary (10 to 50 sentences): Provide a structured summary that includes anecdotes, people, and locations to ensure the report is relatable.
|
| 1253 |
|
| 1254 |
Note: General Optimization Guidelines:
|
|
@@ -1258,40 +1338,88 @@ Note: General Optimization Guidelines:
|
|
| 1258 |
- Shorten words carefully (e.g., "information" -> "info") without causing ambiguity.
|
| 1259 |
- Use symbols where appropriate.
|
| 1260 |
|
| 1261 |
-
3. Follow-up Search Queries: Generate at least {breadth} follow-up search queries relevant to the research topic and the summarized content. Use search operators (AND, OR, quotation marks) as needed.
|
| 1262 |
|
| 1263 |
4. Ensure that the summary length and level of detail is proportional to the source length.
|
| 1264 |
Source length: {snippet_words} words. You may produce a more detailed summary if the text is long.
|
| 1265 |
|
| 1266 |
-
|
| 1267 |
-
|
| 1268 |
-
|
|
|
|
|
|
|
| 1269 |
)
|
| 1270 |
|
| 1271 |
try:
|
| 1272 |
response = openai_call(prompt=prompt, model="gpt-4o-mini", max_tokens_param=max_tokens, temperature=temperature)
|
| 1273 |
-
|
| 1274 |
-
if not res_text:
|
| 1275 |
logging.error("analyze_with_gpt4o: Empty response received from API.")
|
| 1276 |
return {"relevant": "no", "summary": "", "followups": []}
|
| 1277 |
-
|
| 1278 |
-
if
|
| 1279 |
-
|
| 1280 |
-
|
| 1281 |
-
|
| 1282 |
-
|
| 1283 |
-
|
| 1284 |
-
|
| 1285 |
-
|
| 1286 |
-
|
| 1287 |
-
|
| 1288 |
-
|
| 1289 |
-
|
| 1290 |
-
|
| 1291 |
-
|
| 1292 |
-
|
| 1293 |
-
|
| 1294 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1295 |
except Exception as e:
|
| 1296 |
logging.error(f"analyze_with_gpt4o error: {e}")
|
| 1297 |
return {"relevant": "no", "summary": "", "followups": []}
|
|
@@ -2258,26 +2386,19 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
|
|
| 2258 |
title = res.get("title", "No Title")
|
| 2259 |
if not url:
|
| 2260 |
continue
|
| 2261 |
-
|
| 2262 |
-
|
| 2263 |
-
|
| 2264 |
-
|
| 2265 |
-
|
| 2266 |
-
|
| 2267 |
-
headers = {"User-Agent": get_random_header()}
|
| 2268 |
-
response = requests.get(url, headers=headers)
|
| 2269 |
-
response.raise_for_status()
|
| 2270 |
-
raw_content = response.text
|
| 2271 |
-
process_log += f"Extracted full page content from {url}\n"
|
| 2272 |
-
except Exception as e:
|
| 2273 |
-
logging.error(f"Error retrieving content from {url}: {e}")
|
| 2274 |
-
process_log += f"Error retrieving content from {url}: {e}\n"
|
| 2275 |
-
continue
|
| 2276 |
# Skip processing if raw_content is empty or too short (< 1000 characters)
|
| 2277 |
-
if not raw_content or len(raw_content) < 1000 or "could not be extracted" in raw_content.lower()
|
| 2278 |
process_log += f"Content from {url} is either an error or too short (<1000 characters), skipping.\n"
|
| 2279 |
continue
|
| 2280 |
|
|
|
|
|
|
|
| 2281 |
# 1) Clean and do minimal parse
|
| 2282 |
cleaned_html = clean_content(raw_content)
|
| 2283 |
# 2) Extract structured data
|
|
|
|
| 1103 |
]
|
| 1104 |
return random.choice(headers)
|
| 1105 |
|
| 1106 |
+
def process_url(url: str, retries: int = 3, timeout: int = 15) -> str:
    """
    Retrieve the content of a URL, rotating user agents and referrers across
    retries to work around 403 Forbidden responses and other common web
    scraping failures. PDF URLs are delegated to process_pdf().

    Args:
        url: The URL to retrieve content from.
        retries: Number of retry attempts (each with a different user agent).
        timeout: Connection/read timeout in seconds for each attempt.

    Returns:
        The page content as a string on success; otherwise a message that
        starts with "Error" so callers can detect failure via
        raw_content.startswith("Error").
    """
    if url.lower().endswith(".pdf"):
        return process_pdf(url)

    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
        "Mozilla/5.0 (iPad; CPU OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1"
    ]

    referrers = [
        "https://www.google.com/",
        "https://www.bing.com/",
        "https://search.yahoo.com/",
        "https://duckduckgo.com/",
        "https://www.baidu.com/"
    ]

    for attempt in range(retries):
        try:
            # Rotate user agent and referrer on each attempt so a server that
            # blocked one fingerprint may accept the next.
            headers = {
                "User-Agent": user_agents[attempt % len(user_agents)],
                "Referer": referrers[attempt % len(referrers)],
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
                # BUGFIX: "br" removed — requests cannot decode brotli unless
                # the optional brotli package is installed, so advertising it
                # risks receiving an undecodable response body.
                "Accept-Encoding": "gzip, deflate",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
                "Cache-Control": "max-age=0"
            }

            # Randomized, linearly growing backoff between retries to avoid
            # hammering the server.
            if attempt > 0:
                delay = random.uniform(2, 5) * attempt
                logging.info(f"Retry {attempt+1}/{retries} for {url} - waiting {delay:.1f} seconds")
                time.sleep(delay)

            response = requests.get(url, headers=headers, timeout=timeout)

            if response.status_code == 200:
                logging.info(f"Successfully retrieved content from {url} on attempt {attempt+1}")
                return response.text
            elif response.status_code == 403:
                logging.warning(f"403 Forbidden on attempt {attempt+1} for {url}, trying different user agent")
                continue
            elif response.status_code == 404:
                # A 404 will not improve with retries; fail immediately.
                return f"Error: Page not found (404) for {url}"
            else:
                # Raise so the generic handler below logs the failure and, on
                # the last attempt, reports it to the caller.
                response.raise_for_status()

        except requests.exceptions.Timeout:
            logging.warning(f"Timeout on attempt {attempt+1} for {url}")
            if attempt == retries - 1:
                return f"Error: Timeout after {retries} attempts for {url}"

        except requests.exceptions.TooManyRedirects:
            # Redirect loops are deterministic; retrying will not help.
            return f"Error: Too many redirects for {url}"

        except requests.exceptions.ConnectionError:
            logging.warning(f"Connection error on attempt {attempt+1} for {url}")
            if attempt == retries - 1:
                return f"Error: Could not connect to {url} after {retries} attempts"

        except Exception as e:
            logging.error(f"Error retrieving content from {url} (attempt {attempt+1}): {e}")
            if attempt == retries - 1:
                return f"Error accessing URL: {str(e)}"

    # BUGFIX: this fallback previously began with "Content could not be
    # retrieved ...", which did not match the caller's
    # raw_content.startswith("Error") check, so exhausted-retry failures
    # (e.g. repeated 403s) were not classified as errors.
    return f"Error: Content could not be retrieved from {url} after {retries} attempts"
|
| 1190 |
+
|
| 1191 |
def clean_content(raw_content: str) -> str:
|
| 1192 |
# Parse HTML using BeautifulSoup (if not HTML, it will safely return the text)
|
| 1193 |
soup = BeautifulSoup(raw_content, "html.parser")
|
|
|
|
| 1312 |
snippet = summarize_large_text(snippet, target_length=2000, chunk_size=1000, overlap=200)
|
| 1313 |
snippet_words = len(snippet.split())
|
| 1314 |
|
|
|
|
|
|
|
|
|
|
| 1315 |
client = os.getenv('OPENAI_API_KEY') # alternatively, pass your API key here if needed.
|
|
|
|
|
|
|
| 1316 |
|
| 1317 |
prompt = (f"""Analyze the following content from a query result:
|
| 1318 |
|
|
|
|
| 1328 |
- Key Facts (at least 5): List the core factual claims using short, declarative sentences or bullet points. Apply lemmatization and standard abbreviations.
|
| 1329 |
- Key Figures (at least 5): Extract numerical data (statistics, dates, percentages) and include any necessary context (units, references, explanations) required to interpret these numbers. Present them concisely (list or table format).
|
| 1330 |
- Key Arguments (at least 5): Identify main arguments or claims. Summarize supporting evidence and counter-arguments concisely.
|
| 1331 |
+
- Key Quotes (at least 1 if any): Include significant quotes (with the author's name in parentheses). Attribute quotes correctly. Paraphrase if needed, indicating that it's a paraphrase. Use symbols (e.g., &, +, ->, =) to conserve tokens.
|
| 1332 |
- Structured Summary (10 to 50 sentences): Provide a structured summary that includes anecdotes, people, and locations to ensure the report is relatable.
|
| 1333 |
|
| 1334 |
Note: General Optimization Guidelines:
|
|
|
|
| 1338 |
- Shorten words carefully (e.g., "information" -> "info") without causing ambiguity.
|
| 1339 |
- Use symbols where appropriate.
|
| 1340 |
|
| 1341 |
+
3. Follow-up Search Queries: Generate at least {breadth} follow-up search queries relevant to the research topic and the summarized content. Use search operators (AND, OR, quotation marks) as needed.
|
| 1342 |
|
| 1343 |
4. Ensure that the summary length and level of detail is proportional to the source length.
|
| 1344 |
Source length: {snippet_words} words. You may produce a more detailed summary if the text is long.
|
| 1345 |
|
| 1346 |
+
IMPORTANT: Format your response as a proper JSON object with these fields:
|
| 1347 |
+
- "relevant": "yes" or "no"
|
| 1348 |
+
- "summary": {...your structured summary with all parts...}
|
| 1349 |
+
- "followups": [array of follow-up queries]
|
| 1350 |
+
"""
|
| 1351 |
)
|
| 1352 |
|
| 1353 |
try:
|
| 1354 |
response = openai_call(prompt=prompt, model="gpt-4o-mini", max_tokens_param=max_tokens, temperature=temperature)
|
| 1355 |
+
if not response:
|
|
|
|
| 1356 |
logging.error("analyze_with_gpt4o: Empty response received from API.")
|
| 1357 |
return {"relevant": "no", "summary": "", "followups": []}
|
| 1358 |
+
|
| 1359 |
+
# Check if the response already begins with "yes" or "no" (non-JSON format)
|
| 1360 |
+
if response.strip().lower().startswith("yes") or response.strip().lower().startswith("no"):
|
| 1361 |
+
# Handle non-JSON format
|
| 1362 |
+
lines = response.strip().split("\n")
|
| 1363 |
+
relevance = "yes" if lines[0].strip().lower() == "yes" else "no"
|
| 1364 |
+
|
| 1365 |
+
# Extract the follow-up queries (usually in brackets at the end)
|
| 1366 |
+
followups_match = re.search(r'\[(.*?)\]', response, re.DOTALL)
|
| 1367 |
+
followups = []
|
| 1368 |
+
if followups_match:
|
| 1369 |
+
followups_text = followups_match.group(1)
|
| 1370 |
+
# Parse the lines within brackets as comma-separated or quoted items
|
| 1371 |
+
followups = [q.strip().strip('"\'') for q in re.findall(r'"([^"]*)"', followups_text)]
|
| 1372 |
+
if not followups: # Try without quotes
|
| 1373 |
+
followups = [q.strip() for q in followups_text.split(",")]
|
| 1374 |
+
|
| 1375 |
+
# Everything else is the summary
|
| 1376 |
+
summary_text = response
|
| 1377 |
+
if followups_match:
|
| 1378 |
+
summary_text = response[:followups_match.start()].strip()
|
| 1379 |
+
if summary_text.startswith("yes") or summary_text.startswith("no"):
|
| 1380 |
+
summary_text = "\n".join(lines[1:]).strip()
|
| 1381 |
+
|
| 1382 |
+
return {
|
| 1383 |
+
"relevant": relevance,
|
| 1384 |
+
"summary": summary_text,
|
| 1385 |
+
"followups": followups if followups else []
|
| 1386 |
+
}
|
| 1387 |
+
|
| 1388 |
+
else:
|
| 1389 |
+
# Standard JSON parsing
|
| 1390 |
+
# Remove Markdown code fences if present
|
| 1391 |
+
if response.startswith("```"):
|
| 1392 |
+
response = re.sub(r"^```(json)?", "", response)
|
| 1393 |
+
response = re.sub(r"```$", "", response).strip()
|
| 1394 |
+
response = response.strip()
|
| 1395 |
+
# Optionally remove any start/end markers like "json" if present:
|
| 1396 |
+
if response.lower().startswith("json"):
|
| 1397 |
+
response = response[4:].strip()
|
| 1398 |
+
try:
|
| 1399 |
+
result = json.loads(response)
|
| 1400 |
+
return result
|
| 1401 |
+
except json.JSONDecodeError:
|
| 1402 |
+
# If JSON parsing fails, try to extract the information using regex
|
| 1403 |
+
logging.warning("JSON parsing failed, attempting regex extraction")
|
| 1404 |
+
relevance_match = re.search(r'"relevant":\s*"(yes|no)"', response, re.IGNORECASE)
|
| 1405 |
+
relevance = relevance_match.group(1) if relevance_match else "no"
|
| 1406 |
+
|
| 1407 |
+
# Extract follow-up queries using regex
|
| 1408 |
+
followups_match = re.search(r'"followups":\s*\[(.*?)\]', response, re.DOTALL)
|
| 1409 |
+
followups = []
|
| 1410 |
+
if followups_match:
|
| 1411 |
+
followups_text = followups_match.group(1)
|
| 1412 |
+
followups = [q.strip().strip('"\'') for q in re.findall(r'"([^"]*)"', followups_text)]
|
| 1413 |
+
|
| 1414 |
+
# Extract summary (everything else)
|
| 1415 |
+
summary = response
|
| 1416 |
+
|
| 1417 |
+
return {
|
| 1418 |
+
"relevant": relevance,
|
| 1419 |
+
"summary": summary,
|
| 1420 |
+
"followups": followups
|
| 1421 |
+
}
|
| 1422 |
+
|
| 1423 |
except Exception as e:
|
| 1424 |
logging.error(f"analyze_with_gpt4o error: {e}")
|
| 1425 |
return {"relevant": "no", "summary": "", "followups": []}
|
|
|
|
| 2386 |
title = res.get("title", "No Title")
|
| 2387 |
if not url:
|
| 2388 |
continue
|
| 2389 |
+
|
| 2390 |
+
raw_content = process_url(url)
|
| 2391 |
+
if raw_content.startswith("Error"):
|
| 2392 |
+
process_log += f"{raw_content}\n"
|
| 2393 |
+
continue
|
| 2394 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2395 |
# Skip processing if raw_content is empty or too short (< 1000 characters)
|
| 2396 |
+
if not raw_content or len(raw_content) < 1000 or "could not be extracted" in raw_content.lower():
|
| 2397 |
process_log += f"Content from {url} is either an error or too short (<1000 characters), skipping.\n"
|
| 2398 |
continue
|
| 2399 |
|
| 2400 |
+
process_log += f"Successfully extracted content from {url}\n"
|
| 2401 |
+
|
| 2402 |
# 1) Clean and do minimal parse
|
| 2403 |
cleaned_html = clean_content(raw_content)
|
| 2404 |
# 2) Extract structured data
|