Improve search query extraction and domain filtering; unify requirements and enable PDF extraction
This update refines the agent's web search pipeline: queries are extracted more intelligently by stripping trigger phrases and optional counts, preferring quoted phrases or topic phrases after 'about' or 'on', and discarding trailing instructions (a runnable sketch of this behaviour follows the file list below). A list of allowed domains (e.g. .edu, .org, .gov, arxiv.org, kdnuggets.com, towardsdatascience.com, datacamp.com, medium.com) is passed to the search API to prioritise reputable sources. The requirements file has been cleaned to remove duplicate youtube-transcript-api lines and now includes the dependency needed for PDF extraction (PyPDF2). These improvements help preserve conversation context, avoid hallucinations, and support reading online PDFs.
- app.py +56 -9
- requirements.txt +3 -2
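
To make the new extraction behaviour concrete, here is a minimal, self-contained mirror of the steps the app.py hunk below introduces (trigger stripping, the optional-count "find" pattern, quoted/topic phrase selection, and directive trimming). `extract_query` is an illustrative name, not a function in the Space:

import re

SEARCH_TRIGGERS = [
    "search",
    "internet search",
    "web search",
    "find articles",
    "find 5 articles",
    "find five articles",
    "find resources",
]

def extract_query(user_message: str) -> str:
    """Mirror of the extraction steps in the app.py hunk (illustration only)."""
    lower_msg = user_message.lower().strip()
    query = user_message
    removed = False
    # 1. Strip a leading trigger phrase such as "search" or "find articles".
    for trig in SEARCH_TRIGGERS:
        if lower_msg.startswith(trig):
            query = user_message[len(trig):].strip()
            removed = True
            break
    # 2. Otherwise strip "find" plus an optional count and "articles"/"resources".
    if not removed and lower_msg.startswith("find"):
        query = re.sub(r"^find\s+(?:\d+\s+)?(?:articles?|resources?)\s*",
                       "", user_message, flags=re.IGNORECASE).strip()
    # 3. Prefer a quoted phrase; otherwise take the text after "about" or "on".
    match = re.search(r"[\"']([^\"']+)[\"']", query)
    if match:
        query = match.group(1).strip()
    else:
        m2 = re.search(r"\b(?:about|on)\s+([^.,;!?]+)", query, flags=re.IGNORECASE)
        if m2:
            query = m2.group(1).strip()
    # 4. Drop trailing instructions ("summarize", "provide", "examples", ...).
    return re.split(r"\b(?:summarize|summaries|provide|examples|use cases|case studies)\b",
                    query, maxsplit=1, flags=re.IGNORECASE)[0].strip() or query

print(extract_query('find 5 articles about "retrieval augmented generation" and summarize each'))
# -> retrieval augmented generation
print(extract_query("search the internet for articles about prompt engineering"))
# -> prompt engineering

One caveat worth noting: the trigger test in the hunk uses substring containment (`any(trig in lower_msg ...)`), so any message containing "search", including inside words like "research", will also start a search; the sketch above mirrors only the query-extraction half.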
app.py
CHANGED
@@ -165,27 +165,74 @@ def chat(user_message, chat_history, chat_pairs, sources, plan, resource_cache,
     # Determine if the user is requesting a web search. If so, perform the search instead
     # of calling the language model. This allows the assistant to fetch resources when
     # the user asks the agent to "search" or "search the internet".
-
+    # A message triggers a search if it explicitly asks to search or find articles.
+    # We check for common phrases like "search", "find" combined with "articles" or "resources".
+    search_triggers = [
+        "search",
+        "internet search",
+        "web search",
+        "find articles",
+        "find 5 articles",
+        "find five articles",
+        "find resources",
+    ]
     lower_msg = user_message.lower().strip()
-    # Determine if a search should be performed
-
+    # Determine if a search should be performed:
+    # if the message contains the word "search" anywhere, or contains "find" and "article".
+    do_search = False
+    if any(trig in lower_msg for trig in search_triggers):
+        do_search = True
+    elif "find" in lower_msg and ("article" in lower_msg or "articles" in lower_msg or "resource" in lower_msg):
+        do_search = True
     if do_search:
-        #
-        #
+        # Determine the query string from the user's message.
+        # We remove a leading search trigger phrase if present (e.g. "search", "find articles").
         query = user_message
+        removed = False
         for trig in search_triggers:
             if lower_msg.startswith(trig):
-                #
-                query = user_message[len(trig):].strip()
+                # Drop the trigger prefix and any surrounding punctuation
+                query = user_message[len(trig):].strip()
+                removed = True
                 break
+        # If the message starts with "find", remove "find" and any optional number + article/resource words
+        if not removed and lower_msg.startswith("find"):
+            import re
+            pattern = r"^find\s+(?:\d+\s+)?(?:articles?|resources?)\s*"
+            query = re.sub(pattern, "", user_message, flags=re.IGNORECASE).strip()
+        # Further clean the query by extracting quoted phrases or topic descriptors.
+        import re as _re
+        # If the query contains quoted text, use the quoted portion as the search term
+        match = _re.search(r"[\"']([^\"']+)[\"']", query)
+        if match:
+            query = match.group(1).strip()
+        else:
+            # Look for phrases following 'about' or 'on' as a topic indicator
+            m2 = _re.search(r"\b(?:about|on)\s+([^.,;!?]+)", query, flags=_re.IGNORECASE)
+            if m2:
+                query = m2.group(1).strip()
+        # Remove trailing instructions like 'provide summaries' etc.
+        # Discard anything after a directive word such as 'summarize', 'summaries', 'provide', or 'examples'
+        query = _re.split(r"\b(?:summarize|summaries|provide|examples|use cases|case studies)\b", query, maxsplit=1, flags=_re.IGNORECASE)[0].strip() or query
         try:
             # Use cached search results if available for this query key (case-insensitive)
             query_key = query.lower()
             if query_key in resource_cache:
                 search_results = resource_cache[query_key]
             else:
-                # Use our wrapped web_search for better domain filtering and consistent return type
-
+                # Use our wrapped web_search for better domain filtering and consistent return type.
+                # We pass a list of allowed domains to prefer reputable sources (e.g. .edu, .org, .gov and some tech blogs).
+                allowed_domains = [
+                    ".edu",
+                    ".org",
+                    ".gov",
+                    "arxiv.org",
+                    "kdnuggets.com",
+                    "towardsdatascience.com",
+                    "datacamp.com",
+                    "medium.com",
+                ]
+                search_results = web_search(query, max_results=5, allowed_domains=allowed_domains)
                 resource_cache[query_key] = search_results
             # Iterate over search results, fetch their content, cache resources and summarise
             summaries = []
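
The hunk calls `web_search(query, max_results=5, allowed_domains=allowed_domains)`, a wrapper defined elsewhere in app.py and not shown in this commit. A plausible sketch of its domain-filtering half, with `_raw_search` standing in for whatever backend the Space actually uses, might be:

from urllib.parse import urlparse

def _raw_search(query):
    """Stub for the Space's actual search backend (not shown in this commit)."""
    return []  # each result is assumed to look like {"title": ..., "url": ..., "snippet": ...}

def web_search(query, max_results=5, allowed_domains=None):
    """Hypothetical shape of the wrapper the hunk calls; illustration only."""
    results = _raw_search(query)
    if allowed_domains:
        def permitted(url):
            host = urlparse(url).netloc.lower()
            for d in allowed_domains:
                if d.startswith("."):
                    # Suffix rule, e.g. ".edu" matches any *.edu host
                    if host.endswith(d):
                        return True
                elif host == d or host.endswith("." + d):
                    # Exact domain, e.g. "arxiv.org", including its subdomains
                    return True
            return False
        results = [r for r in results if permitted(r.get("url", ""))]
    return results[:max_results]

Treating entries that start with a dot as suffix rules lets a single list mix whole TLD families (.edu, .gov) with specific hosts like arxiv.org.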
requirements.txt
CHANGED
@@ -7,9 +7,10 @@ python-dotenv>=1.0.0
 # Dependencies for scraping and parsing web pages
 requests>=2.31.0
 beautifulsoup4>=4.12.0
-
-# Allow the agent to fetch YouTube video transcripts for summarization
+# Use a single version line for youtube-transcript-api. We require at least 1.0.0
 youtube-transcript-api>=1.0.0
+# Library for extracting text from PDFs
+PyPDF2>=3.0.0
 
 # Required to generate Word documents from course outlines
 python-docx>=1.1.0
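
The PDF-reading code itself is not part of this commit, but with PyPDF2>=3.0.0 available, a minimal sketch of fetching and extracting text from an online PDF (the helper name and its parameters are illustrative, not the Space's actual function) could look like:

import io

import requests
from PyPDF2 import PdfReader

def read_online_pdf(url: str, max_pages: int = 10) -> str:
    """Fetch a PDF over HTTP and return the text of its first pages."""
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    reader = PdfReader(io.BytesIO(resp.content))
    text_parts = []
    for i, page in enumerate(reader.pages):
        if i >= max_pages:
            break
        # extract_text() can return None for image-only pages
        text_parts.append(page.extract_text() or "")
    return "\n".join(text_parts)

# e.g. text = read_online_pdf("https://arxiv.org/pdf/1706.03762")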