Update app.py
Browse files
app.py
CHANGED
|
@@ -197,8 +197,10 @@ def google_search(term, num_results=20, lang="en", timeout=5, safe="active", ssl
|
|
| 197 |
print(f"Found {len(result_block)} results on this page")
|
| 198 |
for result in result_block:
|
| 199 |
link = result.find("a", href=True)
|
| 200 |
-
|
|
|
|
| 201 |
link = link["href"]
|
|
|
|
| 202 |
print(f"Processing link: {link}")
|
| 203 |
try:
|
| 204 |
webpage = session.get(link, headers=headers, timeout=timeout)
|
|
@@ -206,20 +208,21 @@ def google_search(term, num_results=20, lang="en", timeout=5, safe="active", ssl
|
|
| 206 |
visible_text = extract_text_from_webpage(webpage.text)
|
| 207 |
if len(visible_text) > max_chars_per_page:
|
| 208 |
visible_text = visible_text[:max_chars_per_page] + "..."
|
| 209 |
-
all_results.append({"link": link, "text": visible_text})
|
| 210 |
print(f"Successfully extracted text from {link}")
|
| 211 |
except requests.exceptions.RequestException as e:
|
| 212 |
print(f"Error retrieving webpage content: {e}")
|
| 213 |
-
all_results.append({"link": link, "text": None})
|
| 214 |
else:
|
| 215 |
-
print("No link found for this result")
|
| 216 |
-
all_results.append({"link": None, "text": None})
|
| 217 |
start += len(result_block)
|
| 218 |
|
| 219 |
print(f"Search completed. Total results: {len(all_results)}")
|
| 220 |
print("Search results:")
|
| 221 |
for i, result in enumerate(all_results, 1):
|
| 222 |
print(f"Result {i}:")
|
|
|
|
| 223 |
print(f" Link: {result['link']}")
|
| 224 |
if result['text']:
|
| 225 |
print(f" Text: {result['text'][:100]}...") # Print first 100 characters
|
|
@@ -229,11 +232,14 @@ def google_search(term, num_results=20, lang="en", timeout=5, safe="active", ssl
|
|
| 229 |
|
| 230 |
if not all_results:
|
| 231 |
print("No search results found. Returning a default message.")
|
| 232 |
-
return [{"link": None, "text": "No information found in the web search results."}]
|
| 233 |
|
| 234 |
return all_results
|
| 235 |
|
| 236 |
def summarize_content(content, model):
|
|
|
|
|
|
|
|
|
|
| 237 |
# Approximate the token limit using character count
|
| 238 |
# Assuming an average of 4 characters per token
|
| 239 |
max_chars = 7000 * 4 # Leave some room for the prompt
|
|
@@ -282,32 +288,38 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search):
|
|
| 282 |
|
| 283 |
if web_search:
|
| 284 |
search_results = google_search(question)
|
| 285 |
-
model = get_model(temperature, top_p, repetition_penalty)
|
| 286 |
|
| 287 |
-
|
| 288 |
-
for result in search_results:
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
context_str = combined_summaries
|
| 304 |
-
titles = [result["title"] for result in search_results]
|
| 305 |
ranks = rank_search_results(titles, summaries, model)
|
| 306 |
|
| 307 |
-
|
|
|
|
|
|
|
| 308 |
|
| 309 |
-
|
| 310 |
-
|
|
|
|
| 311 |
|
| 312 |
prompt_template = """
|
| 313 |
Answer the question based on the following web search results:
|
|
|
|
| 197 |
print(f"Found {len(result_block)} results on this page")
|
| 198 |
for result in result_block:
|
| 199 |
link = result.find("a", href=True)
|
| 200 |
+
title = result.find("h3")
|
| 201 |
+
if link and title:
|
| 202 |
link = link["href"]
|
| 203 |
+
title = title.get_text()
|
| 204 |
print(f"Processing link: {link}")
|
| 205 |
try:
|
| 206 |
webpage = session.get(link, headers=headers, timeout=timeout)
|
|
|
|
| 208 |
visible_text = extract_text_from_webpage(webpage.text)
|
| 209 |
if len(visible_text) > max_chars_per_page:
|
| 210 |
visible_text = visible_text[:max_chars_per_page] + "..."
|
| 211 |
+
all_results.append({"link": link, "title": title, "text": visible_text})
|
| 212 |
print(f"Successfully extracted text from {link}")
|
| 213 |
except requests.exceptions.RequestException as e:
|
| 214 |
print(f"Error retrieving webpage content: {e}")
|
| 215 |
+
all_results.append({"link": link, "title": title, "text": None})
|
| 216 |
else:
|
| 217 |
+
print("No link or title found for this result")
|
| 218 |
+
all_results.append({"link": None, "title": None, "text": None})
|
| 219 |
start += len(result_block)
|
| 220 |
|
| 221 |
print(f"Search completed. Total results: {len(all_results)}")
|
| 222 |
print("Search results:")
|
| 223 |
for i, result in enumerate(all_results, 1):
|
| 224 |
print(f"Result {i}:")
|
| 225 |
+
print(f" Title: {result['title']}")
|
| 226 |
print(f" Link: {result['link']}")
|
| 227 |
if result['text']:
|
| 228 |
print(f" Text: {result['text'][:100]}...") # Print first 100 characters
|
|
|
|
| 232 |
|
| 233 |
if not all_results:
|
| 234 |
print("No search results found. Returning a default message.")
|
| 235 |
+
return [{"link": None, "title": "No Results", "text": "No information found in the web search results."}]
|
| 236 |
|
| 237 |
return all_results
|
| 238 |
|
| 239 |
def summarize_content(content, model):
|
| 240 |
+
if content is None:
|
| 241 |
+
return "No content available to summarize."
|
| 242 |
+
|
| 243 |
# Approximate the token limit using character count
|
| 244 |
# Assuming an average of 4 characters per token
|
| 245 |
max_chars = 7000 * 4 # Leave some room for the prompt
|
|
|
|
| 288 |
|
| 289 |
if web_search:
|
| 290 |
search_results = google_search(question)
|
|
|
|
| 291 |
|
| 292 |
+
processed_results = []
|
| 293 |
+
for index, result in enumerate(search_results, start=1):
|
| 294 |
+
if result["text"] is not None:
|
| 295 |
+
try:
|
| 296 |
+
summary = summarize_content(result["text"], model)
|
| 297 |
+
processed_results.append({
|
| 298 |
+
"title": result.get("title", f"Result {index}"),
|
| 299 |
+
"content": result["text"],
|
| 300 |
+
"summary": summary,
|
| 301 |
+
"index": index
|
| 302 |
+
})
|
| 303 |
+
except Exception as e:
|
| 304 |
+
print(f"Error processing search result {index}: {str(e)}")
|
| 305 |
+
else:
|
| 306 |
+
print(f"Skipping result {index} due to None content")
|
| 307 |
|
| 308 |
+
if not processed_results:
|
| 309 |
+
return "No valid search results found."
|
| 310 |
+
|
| 311 |
+
# Rank the results
|
| 312 |
+
titles = [r["title"] for r in processed_results]
|
| 313 |
+
summaries = [r["summary"] for r in processed_results]
|
|
|
|
|
|
|
|
|
|
| 314 |
ranks = rank_search_results(titles, summaries, model)
|
| 315 |
|
| 316 |
+
# Update Vector DB
|
| 317 |
+
current_date = datetime.now().strftime("%Y-%m-%d")
|
| 318 |
+
update_vector_db_with_search_results(processed_results, ranks, current_date)
|
| 319 |
|
| 320 |
+
# Prepare context for the question
|
| 321 |
+
context_str = "\n\n".join([f"Title: {r['title']}\nSummary: {r['summary']}\nRank: {ranks[i]}"
|
| 322 |
+
for i, r in enumerate(processed_results)])
|
| 323 |
|
| 324 |
prompt_template = """
|
| 325 |
Answer the question based on the following web search results:
|