Improve search query extraction and domain filtering; unify requirements and enable PDF extraction
This update refines the agent's web search pipeline: queries are extracted more intelligently by stripping trigger phrases and optional counts, preferring quoted phrases or topic phrases after 'about' or 'on', and discarding trailing instructions (a runnable sketch of this behaviour follows the file list below). A list of allowed domains (e.g. .edu, .org, .gov, arxiv.org, kdnuggets.com, towardsdatascience.com, datacamp.com, medium.com) is passed to the search API to prioritise reputable sources. The requirements file has been cleaned to remove duplicate youtube-transcript-api lines and now includes the dependency needed for PDF extraction (PyPDF2). These improvements help preserve conversation context, avoid hallucinations, and support reading online PDFs.
- app.py +56 -9
- requirements.txt +3 -2
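
To make the new extraction behaviour concrete, here is a minimal, self-contained mirror of the steps the app.py hunk below introduces (trigger stripping, the optional-count "find" pattern, quoted/topic phrase selection, and directive trimming). `extract_query` is an illustrative name, not a function in the Space:

import re

SEARCH_TRIGGERS = [
    "search",
    "internet search",
    "web search",
    "find articles",
    "find 5 articles",
    "find five articles",
    "find resources",
]

def extract_query(user_message: str) -> str:
    """Mirror of the extraction steps in the app.py hunk (illustration only)."""
    lower_msg = user_message.lower().strip()
    query = user_message
    removed = False
    # 1. Strip a leading trigger phrase such as "search" or "find articles".
    for trig in SEARCH_TRIGGERS:
        if lower_msg.startswith(trig):
            query = user_message[len(trig):].strip()
            removed = True
            break
    # 2. Otherwise strip "find" plus an optional count and "articles"/"resources".
    if not removed and lower_msg.startswith("find"):
        query = re.sub(r"^find\s+(?:\d+\s+)?(?:articles?|resources?)\s*",
                       "", user_message, flags=re.IGNORECASE).strip()
    # 3. Prefer a quoted phrase; otherwise take the text after "about" or "on".
    match = re.search(r"[\"']([^\"']+)[\"']", query)
    if match:
        query = match.group(1).strip()
    else:
        m2 = re.search(r"\b(?:about|on)\s+([^.,;!?]+)", query, flags=re.IGNORECASE)
        if m2:
            query = m2.group(1).strip()
    # 4. Drop trailing instructions ("summarize", "provide", "examples", ...).
    return re.split(r"\b(?:summarize|summaries|provide|examples|use cases|case studies)\b",
                    query, maxsplit=1, flags=re.IGNORECASE)[0].strip() or query

print(extract_query('find 5 articles about "retrieval augmented generation" and summarize each'))
# -> retrieval augmented generation
print(extract_query("search the internet for articles about prompt engineering"))
# -> prompt engineering

One caveat worth noting: the trigger test in the hunk uses substring containment (`any(trig in lower_msg ...)`), so any message containing "search", including inside words like "research", will also start a search; the sketch above mirrors only the query-extraction half.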
app.py
CHANGED
@@ -165,27 +165,74 @@ def chat(user_message, chat_history, chat_pairs, sources, plan, resource_cache,
     # Determine if the user is requesting a web search. If so, perform the search instead
     # of calling the language model. This allows the assistant to fetch resources when
     # the user asks the agent to "search" or "search the internet".
-
+    # A message triggers a search if it explicitly asks to search or find articles.
+    # We check for common phrases like "search", "find" combined with "articles" or "resources".
+    search_triggers = [
+        "search",
+        "internet search",
+        "web search",
+        "find articles",
+        "find 5 articles",
+        "find five articles",
+        "find resources",
+    ]
     lower_msg = user_message.lower().strip()
-    # Determine if a search should be performed
-
+    # Determine if a search should be performed:
+    # if the message contains the word "search" anywhere, or contains "find" and "article".
+    do_search = False
+    if any(trig in lower_msg for trig in search_triggers):
+        do_search = True
+    elif "find" in lower_msg and ("article" in lower_msg or "articles" in lower_msg or "resource" in lower_msg):
+        do_search = True
     if do_search:
-        #
-        #
+        # Determine the query string from the user's message.
+        # We remove a leading search trigger phrase if present (e.g. "search", "find articles").
         query = user_message
+        removed = False
         for trig in search_triggers:
             if lower_msg.startswith(trig):
-                #
-                query = user_message[len(trig):].strip()
+                # Drop the trigger prefix and any surrounding punctuation
+                query = user_message[len(trig):].strip()
+                removed = True
                 break
+        # If the message starts with "find", remove "find" and any optional number + article/resource words
+        if not removed and lower_msg.startswith("find"):
+            import re
+            pattern = r"^find\s+(?:\d+\s+)?(?:articles?|resources?)\s*"
+            query = re.sub(pattern, "", user_message, flags=re.IGNORECASE).strip()
+        # Further clean the query by extracting quoted phrases or topic descriptors.
+        import re as _re
+        # If the query contains quoted text, use the quoted portion as the search term
+        match = _re.search(r"[\"']([^\"']+)[\"']", query)
+        if match:
+            query = match.group(1).strip()
+        else:
+            # Look for phrases following 'about' or 'on' as a topic indicator
+            m2 = _re.search(r"\b(?:about|on)\s+([^.,;!?]+)", query, flags=_re.IGNORECASE)
+            if m2:
+                query = m2.group(1).strip()
+        # Remove trailing instructions like 'provide summaries' etc.
+        # Discard anything after a directive word such as 'summarize', 'summaries', 'provide', or 'examples'
+        query = _re.split(r"\b(?:summarize|summaries|provide|examples|use cases|case studies)\b", query, maxsplit=1, flags=_re.IGNORECASE)[0].strip() or query
         try:
             # Use cached search results if available for this query key (case-insensitive)
             query_key = query.lower()
             if query_key in resource_cache:
                 search_results = resource_cache[query_key]
             else:
-                # Use our wrapped web_search for better domain filtering and consistent return type
-
+                # Use our wrapped web_search for better domain filtering and consistent return type.
+                # We pass a list of allowed domains to prefer reputable sources (e.g. .edu, .org, .gov and some tech blogs).
+                allowed_domains = [
+                    ".edu",
+                    ".org",
+                    ".gov",
+                    "arxiv.org",
+                    "kdnuggets.com",
+                    "towardsdatascience.com",
+                    "datacamp.com",
+                    "medium.com",
+                ]
+                search_results = web_search(query, max_results=5, allowed_domains=allowed_domains)
                 resource_cache[query_key] = search_results
             # Iterate over search results, fetch their content, cache resources and summarise
             summaries = []
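
The hunk calls `web_search(query, max_results=5, allowed_domains=allowed_domains)`, a wrapper defined elsewhere in app.py and not shown in this commit. A plausible sketch of its domain-filtering half, with `_raw_search` standing in for whatever backend the Space actually uses, might be:

from urllib.parse import urlparse

def _raw_search(query):
    """Stub for the Space's actual search backend (not shown in this commit)."""
    return []  # each result is assumed to look like {"title": ..., "url": ..., "snippet": ...}

def web_search(query, max_results=5, allowed_domains=None):
    """Hypothetical shape of the wrapper the hunk calls; illustration only."""
    results = _raw_search(query)
    if allowed_domains:
        def permitted(url):
            host = urlparse(url).netloc.lower()
            for d in allowed_domains:
                if d.startswith("."):
                    # Suffix rule, e.g. ".edu" matches any *.edu host
                    if host.endswith(d):
                        return True
                elif host == d or host.endswith("." + d):
                    # Exact domain, e.g. "arxiv.org", including its subdomains
                    return True
            return False
        results = [r for r in results if permitted(r.get("url", ""))]
    return results[:max_results]

Treating entries that start with a dot as suffix rules lets a single list mix whole TLD families (.edu, .gov) with specific hosts like arxiv.org.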
requirements.txt
CHANGED
@@ -7,9 +7,10 @@ python-dotenv>=1.0.0
 # Dependencies for scraping and parsing web pages
 requests>=2.31.0
 beautifulsoup4>=4.12.0
-
-# Allow the agent to fetch YouTube video transcripts for summarization
+# Use a single version line for youtube-transcript-api. We require at least 1.0.0
 youtube-transcript-api>=1.0.0
+# Library for extracting text from PDFs
+PyPDF2>=3.0.0
 
 # Required to generate Word documents from course outlines
 python-docx>=1.1.0
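
The PDF-reading code itself is not part of this commit, but with PyPDF2>=3.0.0 available, a minimal sketch of fetching and extracting text from an online PDF (the helper name and its parameters are illustrative, not the Space's actual function) could look like:

import io

import requests
from PyPDF2 import PdfReader

def read_online_pdf(url: str, max_pages: int = 10) -> str:
    """Fetch a PDF over HTTP and return the text of its first pages."""
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    reader = PdfReader(io.BytesIO(resp.content))
    text_parts = []
    for i, page in enumerate(reader.pages):
        if i >= max_pages:
            break
        # extract_text() can return None for image-only pages
        text_parts.append(page.extract_text() or "")
    return "\n".join(text_parts)

# e.g. text = read_online_pdf("https://arxiv.org/pdf/1706.03762")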