Wajahat698 committed on
Commit
50b7896
·
verified ·
1 Parent(s): 87088e7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -19
app.py CHANGED
@@ -1097,26 +1097,29 @@ def search_knowledge_base(query):
1097
  # Retrieve the top 5 most relevant documents
1098
  retrieved_docs = st.session_state["faiss_db"].similarity_search(query, k=3)
1099
  return retrieved_docs
1100
-
1101
async def validate_url(session, url):
    """Return True when a HEAD request to *url* answers with HTTP 200, else False.

    Redirects are followed; any network failure or timeout counts as invalid.
    """
    try:
        async with session.head(url, allow_redirects=True, timeout=5) as resp:
            return resp.status == 200
    except Exception:
        # Timeouts, DNS failures, malformed URLs: treat the link as dead.
        return False
1109
 
1110
# Function to validate a batch of URLs asynchronously
async def validate_links_async(urls):
    """Probe every URL in *urls* concurrently.

    Returns a list of booleans, one per URL, in the same order as *urls*.
    """
    async with aiohttp.ClientSession() as session:
        pending = [validate_url(session, link) for link in urls]
        return await asyncio.gather(*pending)
1116
 
1117
  # Function to perform a Google search and validate links
1118
  def google_search(query):
1119
  try:
 
1120
  search_client = Client(api_key=serper_api_key)
1121
  results = search_client.search({"engine": "google", "q": query})
1122
 
@@ -1125,22 +1128,20 @@ def google_search(query):
1125
  urls = [result.get("link") for result in organic_results if result.get("link")]
1126
  snippets = [result.get("snippet") for result in organic_results if result.get("link")]
1127
 
1128
- # Validate URLs asynchronously
1129
- valid_links = asyncio.run(validate_links_async(urls))
1130
-
1131
- # Filter valid snippets and links
1132
- valid_results = [
1133
- {"snippet": snippet, "link": url}
1134
- for snippet, url, is_valid in zip(snippets, urls, valid_links)
1135
- if is_valid
1136
- ]
1137
-
1138
  return valid_results
1139
 
1140
  except Exception as e:
1141
  logger.error(f"Error in Google search: {e}")
1142
  return [{"snippet": "Error occurred during Google search", "link": ""}]
1143
-
1144
  def rag_response(query, selected_doc_ids=None):
1145
  """
1146
  Handle queries by searching both the main knowledge base and the selected documents.
 
1097
  # Retrieve the top 5 most relevant documents
1098
  retrieved_docs = st.session_state["faiss_db"].similarity_search(query, k=3)
1099
  return retrieved_docs
1100
# Asynchronous function to validate and verify link content
async def validate_and_fetch_content(session, url, query):
    """Fetch *url* and report whether it is reachable and mentions *query*.

    Returns a dict with keys:
      - "url": the URL that was checked
      - "valid": True when the GET request answered with HTTP 200
      - "contains_query": True when the page body contains *query*
        (case-insensitive substring match)
    Never raises: network failures and timeouts yield a not-valid result.
    """
    try:
        async with session.get(url, allow_redirects=True, timeout=10) as response:
            if response.status == 200:
                content = await response.text()
                if query.lower() in content.lower():
                    return {"url": url, "valid": True, "contains_query": True}
                return {"url": url, "valid": True, "contains_query": False}
            # BUG FIX: a non-200 response previously fell through and returned
            # None, which made callers crash on validation["valid"]. Report it
            # explicitly as an invalid link instead.
            return {"url": url, "valid": False, "contains_query": False}
    except Exception:
        return {"url": url, "valid": False, "contains_query": False}
1111
 
1112
# Function to validate a batch of URLs asynchronously
async def validate_links_with_content(urls, query):
    """Fetch every URL in *urls* concurrently and verify it mentions *query*.

    Returns one result dict per URL (see validate_and_fetch_content), in the
    same order as *urls*.
    """
    async with aiohttp.ClientSession() as session:
        checks = [validate_and_fetch_content(session, link, query) for link in urls]
        return await asyncio.gather(*checks)
1118
 
1119
  # Function to perform a Google search and validate links
1120
  def google_search(query):
1121
  try:
1122
+ # Step 1: Perform the search using SERP API
1123
  search_client = Client(api_key=serper_api_key)
1124
  results = search_client.search({"engine": "google", "q": query})
1125
 
 
1128
  urls = [result.get("link") for result in organic_results if result.get("link")]
1129
  snippets = [result.get("snippet") for result in organic_results if result.get("link")]
1130
 
1131
+ # Step 2: Validate and verify link content
1132
+ validated_links = asyncio.run(validate_links_with_content(urls, query))
1133
+
1134
+ # Step 3: Combine valid snippets and URLs with verified content
1135
+ valid_results = []
1136
+ for snippet, validation in zip(snippets, validated_links):
1137
+ if validation["valid"] and validation["contains_query"]:
1138
+ valid_results.append({"snippet": snippet, "link": validation["url"]})
1139
+
 
1140
  return valid_results
1141
 
1142
  except Exception as e:
1143
  logger.error(f"Error in Google search: {e}")
1144
  return [{"snippet": "Error occurred during Google search", "link": ""}]
 
1145
  def rag_response(query, selected_doc_ids=None):
1146
  """
1147
  Handle queries by searching both the main knowledge base and the selected documents.