Guiyom committed on
Commit
cf84868
·
verified ·
1 Parent(s): 073e11f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +167 -46
app.py CHANGED
@@ -1103,6 +1103,91 @@ def get_random_header():
1103
  ]
1104
  return random.choice(headers)
1105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1106
  def clean_content(raw_content: str) -> str:
1107
  # Parse HTML using BeautifulSoup (if not HTML, it will safely return the text)
1108
  soup = BeautifulSoup(raw_content, "html.parser")
@@ -1227,12 +1312,7 @@ def analyze_with_gpt4o(query: str, snippet: str, breadth: int, temperature: floa
1227
  snippet = summarize_large_text(snippet, target_length=2000, chunk_size=1000, overlap=200)
1228
  snippet_words = len(snippet.split())
1229
 
1230
- # Decide a proportional dynamic token count (for reference; not used to limit the API call below)
1231
- dynamic_tokens = min(3000, max(250, int(snippet_words * 0.5)))
1232
-
1233
  client = os.getenv('OPENAI_API_KEY') # alternatively, pass your API key here if needed.
1234
- # (Assuming you use a client instance from your OpenAI library elsewhere.)
1235
- # Here, we assume that openai.OpenAI(api_key=...) is wrapped by openai_call.
1236
 
1237
  prompt = (f"""Analyze the following content from a query result:
1238
 
@@ -1248,7 +1328,7 @@ Instructions:
1248
  - Key Facts (at least 5): List the core factual claims using short, declarative sentences or bullet points. Apply lemmatization and standard abbreviations.
1249
  - Key Figures (at least 5): Extract numerical data (statistics, dates, percentages) and include any necessary context (units, references, explanations) required to interpret these numbers. Present them concisely (list or table format).
1250
  - Key Arguments (at least 5): Identify main arguments or claims. Summarize supporting evidence and counter-arguments concisely.
1251
- - Key Quotes (at least 1 if any): Include significant quotes (with the author's name in parentheses). Attribute quotes correctly. Paraphrase if needed, indicating that its a paraphrase. Use symbols (e.g., &, +, ->, =) to conserve tokens.
1252
  - Structured Summary (10 to 50 sentences): Provide a structured summary that includes anecdotes, people, and locations to ensure the report is relatable.
1253
 
1254
  Note: General Optimization Guidelines:
@@ -1258,40 +1338,88 @@ Note: General Optimization Guidelines:
1258
  - Shorten words carefully (e.g., "information" -> "info") without causing ambiguity.
1259
  - Use symbols where appropriate.
1260
 
1261
- 3. Follow-up Search Queries: Generate at least {breadth} follow-up search queries relevant to the research topic and the summarized content. Use search operators (AND, OR, quotation marks) as needed. Output the queries as a JSON list of strings (e.g., ["query1", "query2", ...]) with no additional formatting, extra text, or markdown (do not include the word "python" anywhere).
1262
 
1263
  4. Ensure that the summary length and level of detail is proportional to the source length.
1264
  Source length: {snippet_words} words. You may produce a more detailed summary if the text is long.
1265
 
1266
- **Output Requirement: Output the queries as a JSON list of strings (e.g., ["query1", "query2", ...]) with no additional formatting, extra text, or markdown (No mention of the coding language ex:"python" or "html" anywhere before the result).
1267
-
1268
- Proceed."""
 
 
1269
  )
1270
 
1271
  try:
1272
  response = openai_call(prompt=prompt, model="gpt-4o-mini", max_tokens_param=max_tokens, temperature=temperature)
1273
- res_text = response.strip()
1274
- if not res_text:
1275
  logging.error("analyze_with_gpt4o: Empty response received from API.")
1276
  return {"relevant": "no", "summary": "", "followups": []}
1277
- # Remove Markdown code fences if present
1278
- if res_text.startswith("```"):
1279
- res_text = re.sub(r"^```(json)?", "", res_text)
1280
- res_text = re.sub(r"```$", "", res_text).strip()
1281
- res_text = res_text.strip().strip("```").strip()
1282
- # Optionally remove any start/end markers like "json" if present:
1283
- if res_text.lower().startswith("json"):
1284
- res_text = res_text[4:].strip()
1285
- try:
1286
- result = json.loads(res_text)
1287
- except json.JSONDecodeError as je:
1288
- logging.error(f"analyze_with_gpt4o: JSON decode error: {je}. Raw response: '{res_text}'")
1289
- return {"relevant": "no", "summary": "", "followups": []}
1290
- except json.JSONDecodeError as je:
1291
- logging.error(f"analyze_with_gpt4o: JSON decode error: {je}. Raw response: '{res_text}'")
1292
- return {"relevant": "no", "summary": "", "followups": []}
1293
- logging.info(f"analyze_with_gpt4o: snippet analysis result: {result}")
1294
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1295
  except Exception as e:
1296
  logging.error(f"analyze_with_gpt4o error: {e}")
1297
  return {"relevant": "no", "summary": "", "followups": []}
@@ -2258,26 +2386,19 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
2258
  title = res.get("title", "No Title")
2259
  if not url:
2260
  continue
2261
- if url.lower().endswith(".pdf"):
2262
- raw_content = process_pdf(url)
2263
- if "Error processing PDF" in raw_content:
2264
- continue
2265
- else:
2266
- try:
2267
- headers = {"User-Agent": get_random_header()}
2268
- response = requests.get(url, headers=headers)
2269
- response.raise_for_status()
2270
- raw_content = response.text
2271
- process_log += f"Extracted full page content from {url}\n"
2272
- except Exception as e:
2273
- logging.error(f"Error retrieving content from {url}: {e}")
2274
- process_log += f"Error retrieving content from {url}: {e}\n"
2275
- continue
2276
  # Skip processing if raw_content is empty or too short (< 1000 characters)
2277
- if not raw_content or len(raw_content) < 1000 or "could not be extracted" in raw_content.lower() or "error" in raw_content.lower():
2278
  process_log += f"Content from {url} is either an error or too short (<1000 characters), skipping.\n"
2279
  continue
2280
 
 
 
2281
  # 1) Clean and do minimal parse
2282
  cleaned_html = clean_content(raw_content)
2283
  # 2) Extract structured data
 
1103
  ]
1104
  return random.choice(headers)
1105
 
1106
def process_url(url: str, retries: int = 3, timeout: int = 15) -> str:
    """
    Retrieve the content of a URL, rotating user agents and referrers across
    attempts to work around 403 (bot-blocking) responses and other common
    web-scraping issues.

    Args:
        url: The URL to retrieve content from.
        retries: Number of retry attempts (each attempt uses a different
            user agent / referrer pair).
        timeout: Connection timeout in seconds for each attempt.

    Returns:
        The page content as a string on success, or a message starting with
        "Error" describing the failure. Callers rely on the "Error" prefix
        to detect failures, so every failure path below must use it.
    """
    # PDFs go through the dedicated extractor rather than a plain GET.
    if url.lower().endswith(".pdf"):
        return process_pdf(url)

    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
        "Mozilla/5.0 (iPad; CPU OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1"
    ]

    referrers = [
        "https://www.google.com/",
        "https://www.bing.com/",
        "https://search.yahoo.com/",
        "https://duckduckgo.com/",
        "https://www.baidu.com/"
    ]

    for attempt in range(retries):
        try:
            # Choose a different user agent and referrer for each attempt.
            headers = {
                "User-Agent": user_agents[attempt % len(user_agents)],
                "Referer": referrers[attempt % len(referrers)],
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate, br",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
                "Cache-Control": "max-age=0"
            }

            # Back off with a randomized, linearly increasing delay between
            # retries to avoid hammering the server.
            if attempt > 0:
                delay = random.uniform(2, 5) * attempt
                logging.info(f"Retry {attempt+1}/{retries} for {url} - waiting {delay:.1f} seconds")
                time.sleep(delay)

            response = requests.get(url, headers=headers, timeout=timeout)

            if response.status_code == 200:
                logging.info(f"Successfully retrieved content from {url} on attempt {attempt+1}")
                return response.text
            elif response.status_code == 403:
                # Likely bot detection; retry with the next UA/referrer pair.
                logging.warning(f"403 Forbidden on attempt {attempt+1} for {url}, trying different user agent")
                continue
            elif response.status_code == 404:
                # Permanent; no point retrying.
                return f"Error: Page not found (404) for {url}"
            else:
                # Raise HTTPError so the generic handler below decides
                # whether to retry or give up.
                response.raise_for_status()

        except requests.exceptions.Timeout:
            logging.warning(f"Timeout on attempt {attempt+1} for {url}")
            if attempt == retries - 1:
                return f"Error: Timeout after {retries} attempts for {url}"

        except requests.exceptions.TooManyRedirects:
            # Redirect loops are permanent; no point retrying.
            return f"Error: Too many redirects for {url}"

        except requests.exceptions.ConnectionError:
            logging.warning(f"Connection error on attempt {attempt+1} for {url}")
            if attempt == retries - 1:
                return f"Error: Could not connect to {url} after {retries} attempts"

        except Exception as e:
            logging.error(f"Error retrieving content from {url} (attempt {attempt+1}): {e}")
            if attempt == retries - 1:
                return f"Error accessing URL: {str(e)}"

    # Fix: prefix with "Error" so callers that check
    # raw_content.startswith("Error") detect the exhausted-retries case
    # (previously this sentinel slipped past that check).
    return f"Error: Content could not be retrieved from {url} after {retries} attempts"
1190
+
1191
  def clean_content(raw_content: str) -> str:
1192
  # Parse HTML using BeautifulSoup (if not HTML, it will safely return the text)
1193
  soup = BeautifulSoup(raw_content, "html.parser")
 
1312
  snippet = summarize_large_text(snippet, target_length=2000, chunk_size=1000, overlap=200)
1313
  snippet_words = len(snippet.split())
1314
 
 
 
 
1315
  client = os.getenv('OPENAI_API_KEY') # alternatively, pass your API key here if needed.
 
 
1316
 
1317
  prompt = (f"""Analyze the following content from a query result:
1318
 
 
1328
  - Key Facts (at least 5): List the core factual claims using short, declarative sentences or bullet points. Apply lemmatization and standard abbreviations.
1329
  - Key Figures (at least 5): Extract numerical data (statistics, dates, percentages) and include any necessary context (units, references, explanations) required to interpret these numbers. Present them concisely (list or table format).
1330
  - Key Arguments (at least 5): Identify main arguments or claims. Summarize supporting evidence and counter-arguments concisely.
1331
+ - Key Quotes (at least 1 if any): Include significant quotes (with the author's name in parentheses). Attribute quotes correctly. Paraphrase if needed, indicating that it's a paraphrase. Use symbols (e.g., &, +, ->, =) to conserve tokens.
1332
  - Structured Summary (10 to 50 sentences): Provide a structured summary that includes anecdotes, people, and locations to ensure the report is relatable.
1333
 
1334
  Note: General Optimization Guidelines:
 
1338
  - Shorten words carefully (e.g., "information" -> "info") without causing ambiguity.
1339
  - Use symbols where appropriate.
1340
 
1341
+ 3. Follow-up Search Queries: Generate at least {breadth} follow-up search queries relevant to the research topic and the summarized content. Use search operators (AND, OR, quotation marks) as needed.
1342
 
1343
  4. Ensure that the summary length and level of detail is proportional to the source length.
1344
  Source length: {snippet_words} words. You may produce a more detailed summary if the text is long.
1345
 
1346
+ IMPORTANT: Format your response as a proper JSON object with these fields:
1347
+ - "relevant": "yes" or "no"
1348
+ - "summary": {...your structured summary with all parts...}
1349
+ - "followups": [array of follow-up queries]
1350
+ """
1351
  )
1352
 
1353
  try:
1354
  response = openai_call(prompt=prompt, model="gpt-4o-mini", max_tokens_param=max_tokens, temperature=temperature)
1355
+ if not response:
 
1356
  logging.error("analyze_with_gpt4o: Empty response received from API.")
1357
  return {"relevant": "no", "summary": "", "followups": []}
1358
+
1359
+ # Check if the response already begins with "yes" or "no" (non-JSON format)
1360
+ if response.strip().lower().startswith("yes") or response.strip().lower().startswith("no"):
1361
+ # Handle non-JSON format
1362
+ lines = response.strip().split("\n")
1363
+ relevance = "yes" if lines[0].strip().lower() == "yes" else "no"
1364
+
1365
+ # Extract the follow-up queries (usually in brackets at the end)
1366
+ followups_match = re.search(r'\[(.*?)\]', response, re.DOTALL)
1367
+ followups = []
1368
+ if followups_match:
1369
+ followups_text = followups_match.group(1)
1370
+ # Parse the lines within brackets as comma-separated or quoted items
1371
+ followups = [q.strip().strip('"\'') for q in re.findall(r'"([^"]*)"', followups_text)]
1372
+ if not followups: # Try without quotes
1373
+ followups = [q.strip() for q in followups_text.split(",")]
1374
+
1375
+ # Everything else is the summary
1376
+ summary_text = response
1377
+ if followups_match:
1378
+ summary_text = response[:followups_match.start()].strip()
1379
+ if summary_text.startswith("yes") or summary_text.startswith("no"):
1380
+ summary_text = "\n".join(lines[1:]).strip()
1381
+
1382
+ return {
1383
+ "relevant": relevance,
1384
+ "summary": summary_text,
1385
+ "followups": followups if followups else []
1386
+ }
1387
+
1388
+ else:
1389
+ # Standard JSON parsing
1390
+ # Remove Markdown code fences if present
1391
+ if response.startswith("```"):
1392
+ response = re.sub(r"^```(json)?", "", response)
1393
+ response = re.sub(r"```$", "", response).strip()
1394
+ response = response.strip()
1395
+ # Optionally remove any start/end markers like "json" if present:
1396
+ if response.lower().startswith("json"):
1397
+ response = response[4:].strip()
1398
+ try:
1399
+ result = json.loads(response)
1400
+ return result
1401
+ except json.JSONDecodeError:
1402
+ # If JSON parsing fails, try to extract the information using regex
1403
+ logging.warning("JSON parsing failed, attempting regex extraction")
1404
+ relevance_match = re.search(r'"relevant":\s*"(yes|no)"', response, re.IGNORECASE)
1405
+ relevance = relevance_match.group(1) if relevance_match else "no"
1406
+
1407
+ # Extract follow-up queries using regex
1408
+ followups_match = re.search(r'"followups":\s*\[(.*?)\]', response, re.DOTALL)
1409
+ followups = []
1410
+ if followups_match:
1411
+ followups_text = followups_match.group(1)
1412
+ followups = [q.strip().strip('"\'') for q in re.findall(r'"([^"]*)"', followups_text)]
1413
+
1414
+ # Extract summary (everything else)
1415
+ summary = response
1416
+
1417
+ return {
1418
+ "relevant": relevance,
1419
+ "summary": summary,
1420
+ "followups": followups
1421
+ }
1422
+
1423
  except Exception as e:
1424
  logging.error(f"analyze_with_gpt4o error: {e}")
1425
  return {"relevant": "no", "summary": "", "followups": []}
 
2386
  title = res.get("title", "No Title")
2387
  if not url:
2388
  continue
2389
+
2390
+ raw_content = process_url(url)
2391
+ if raw_content.startswith("Error"):
2392
+ process_log += f"{raw_content}\n"
2393
+ continue
2394
+
 
 
 
 
 
 
 
 
 
2395
  # Skip processing if raw_content is empty or too short (< 1000 characters)
2396
+ if not raw_content or len(raw_content) < 1000 or "could not be extracted" in raw_content.lower():
2397
  process_log += f"Content from {url} is either an error or too short (<1000 characters), skipping.\n"
2398
  continue
2399
 
2400
+ process_log += f"Successfully extracted content from {url}\n"
2401
+
2402
  # 1) Clean and do minimal parse
2403
  cleaned_html = clean_content(raw_content)
2404
  # 2) Extract structured data