Prof-Reza commited on
Commit
f2c945e
·
verified ·
1 Parent(s): d33d74b

Improve search query extraction and domain filtering; unify requirements and enable PDF extraction

Browse files

This update refines the agent's web search pipeline: it extracts queries more intelligently by stripping trigger phrases and optional counts, selecting quoted or topic phrases after 'about' or 'on', and discarding trailing instructions. It passes a list of allowed domains (e.g. .edu, .org, .gov, arxiv.org, kdnuggets.com, towardsdatascience.com, datacamp.com, medium.com) to the search API to prioritise reputable sources. The requirements file has been cleaned to remove duplicate youtube-transcript-api lines and includes dependencies needed for PDF extraction (PyPDF2). These improvements help preserve conversation context, avoid hallucinations, and support reading online PDFs.

Files changed (2) hide show
  1. app.py +56 -9
  2. requirements.txt +3 -2
app.py CHANGED
@@ -165,27 +165,74 @@ def chat(user_message, chat_history, chat_pairs, sources, plan, resource_cache,
165
  # Determine if the user is requesting a web search. If so, perform the search instead
166
  # of calling the language model. This allows the assistant to fetch resources when
167
  # the user asks the agent to "search" or "search the internet".
168
- search_triggers = ["search", "internet search", "web search"]
 
 
 
 
 
 
 
 
 
 
169
  lower_msg = user_message.lower().strip()
170
- # Determine if a search should be performed
171
- do_search = any(lower_msg.startswith(trig) for trig in search_triggers)
 
 
 
 
 
172
  if do_search:
173
- # Extract query after trigger word if present (e.g. "search vibe coding" -> "vibe coding")
174
- # Otherwise use the full message minus the trigger
175
  query = user_message
 
176
  for trig in search_triggers:
177
  if lower_msg.startswith(trig):
178
- # Remove the trigger from the start of the query string
179
- query = user_message[len(trig):].strip() or user_message
 
180
  break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  try:
182
  # Use cached search results if available for this query key (case-insensitive)
183
  query_key = query.lower()
184
  if query_key in resource_cache:
185
  search_results = resource_cache[query_key]
186
  else:
187
- # Use our wrapped web_search for better domain filtering and consistent return type
188
- search_results = web_search(query, max_results=5)
 
 
 
 
 
 
 
 
 
 
 
189
  resource_cache[query_key] = search_results
190
  # Iterate over search results, fetch their content, cache resources and summarise
191
  summaries = []
 
165
  # Determine if the user is requesting a web search. If so, perform the search instead
166
  # of calling the language model. This allows the assistant to fetch resources when
167
  # the user asks the agent to "search" or "search the internet".
168
+ # A message triggers a search if it explicitly asks to search or find articles.
169
+ # We check for common phrases like "search", "find" combined with "articles" or "resources".
170
+ search_triggers = [
171
+ "search",
172
+ "internet search",
173
+ "web search",
174
+ "find articles",
175
+ "find 5 articles",
176
+ "find five articles",
177
+ "find resources",
178
+ ]
179
  lower_msg = user_message.lower().strip()
180
+ # Determine if a search should be performed:
181
+ # if the message contains the word "search" anywhere, or contains "find" and "article".
182
+ do_search = False
183
+ if any(trig in lower_msg for trig in search_triggers):
184
+ do_search = True
185
+ elif "find" in lower_msg and ("article" in lower_msg or "articles" in lower_msg or "resource" in lower_msg):
186
+ do_search = True
187
  if do_search:
188
+ # Determine the query string from the user's message.
189
+ # We remove a leading search trigger phrase if present (e.g. "search", "find articles").
190
  query = user_message
191
+ removed = False
192
  for trig in search_triggers:
193
  if lower_msg.startswith(trig):
194
+ # Drop the trigger prefix and any surrounding punctuation
195
+ query = user_message[len(trig):].strip()
196
+ removed = True
197
  break
198
+ # If the message starts with "find", remove "find" and any optional number + article/resource words
199
+ if not removed and lower_msg.startswith("find"):
200
+ import re
201
+ pattern = r"^find\s+(?:\d+\s+)?(?:articles?|resources?)\s*"
202
+ query = re.sub(pattern, "", user_message, flags=re.IGNORECASE).strip()
203
+ # Further clean the query by extracting quoted phrases or topic descriptors.
204
+ import re as _re
205
+ # If the query contains quoted text, use the quoted portion as the search term
206
+ match = _re.search(r"[\"']([^\"']+)[\"']", query)
207
+ if match:
208
+ query = match.group(1).strip()
209
+ else:
210
+ # Look for phrases following 'about' or 'on' as a topic indicator
211
+ m2 = _re.search(r"\b(?:about|on)\s+([^.,;!?]+)", query, flags=_re.IGNORECASE)
212
+ if m2:
213
+ query = m2.group(1).strip()
214
+ # Remove trailing instructions like 'provide summaries' etc.
215
+ # Discard anything after a directive word such as 'summarize', 'summaries', 'provide', or 'examples'
216
+ query = _re.split(r"\b(?:summarize|summaries|provide|examples|use cases|case studies)\b", query, maxsplit=1, flags=_re.IGNORECASE)[0].strip() or query
217
  try:
218
  # Use cached search results if available for this query key (case-insensitive)
219
  query_key = query.lower()
220
  if query_key in resource_cache:
221
  search_results = resource_cache[query_key]
222
  else:
223
+ # Use our wrapped web_search for better domain filtering and consistent return type.
224
+ # We pass a list of allowed domains to prefer reputable sources (e.g. .edu, .org, .gov and some tech blogs).
225
+ allowed_domains = [
226
+ ".edu",
227
+ ".org",
228
+ ".gov",
229
+ "arxiv.org",
230
+ "kdnuggets.com",
231
+ "towardsdatascience.com",
232
+ "datacamp.com",
233
+ "medium.com",
234
+ ]
235
+ search_results = web_search(query, max_results=5, allowed_domains=allowed_domains)
236
  resource_cache[query_key] = search_results
237
  # Iterate over search results, fetch their content, cache resources and summarise
238
  summaries = []
requirements.txt CHANGED
@@ -7,9 +7,10 @@ python-dotenv>=1.0.0
7
  # Dependencies for scraping and parsing web pages
8
  requests>=2.31.0
9
  beautifulsoup4>=4.12.0
10
-
11
- # Allow the agent to fetch YouTube video transcripts for summarization
12
  youtube-transcript-api>=1.0.0
 
 
13
 
14
  # Required to generate Word documents from course outlines
15
  python-docx>=1.1.0
 
7
  # Dependencies for scraping and parsing web pages
8
  requests>=2.31.0
9
  beautifulsoup4>=4.12.0
10
+ # Use a single version line for youtube-transcript-api. We require at least 1.0.0
 
11
  youtube-transcript-api>=1.0.0
12
+ # Library for extracting text from PDFs
13
+ PyPDF2>=3.0.0
14
 
15
  # Required to generate Word documents from course outlines
16
  python-docx>=1.1.0