Spaces:

sohamw03
/

knowledge-net

Paused

App Files Community

Soham Waghmare committed on Apr 21, 2025

Commit

56e3a38

1 Parent(s): 963ab6b

fix: duckduckgo search index error; improved prompts

Browse files

Files changed (2) hide show

backend/knet.py +41 -27
backend/scraper.py +9 -2

backend/knet.py CHANGED Viewed

@@ -25,27 +25,35 @@ class Prompt:
     def __init__(self) -> None:
         self.research_plan = dedent("""You are an expert Deep Research agent, part of a Multiagent system.
-        User query:
-        "{topic}".
         ---
         Generate few very high level steps on which other agents can do info collection runs. Provide only data collection steps, no data identification, summarization, manipulation, selection, etc.
         Return a string array of steps.""")
         self.site_summary = dedent("""Extract specific verbatim key information from the following content that is related to the topic "{query}". No small talk.
-        Findings:
-        {findings}""")
         self.continue_branch = dedent("""Given the current state of research, decide whether to continue exploring the current branch or not.
-        Global Research Plan:
         {research_plan}
         Current Topic: {query}
-        Searched Queries:
         {past_queries}
-        Findings under current topic:
         {ctx_manager}
         Consider:
         - Information saturation
@@ -56,56 +64,62 @@ class Prompt:
         Return only decision: true/false""")
         self.search_query = dedent("""Based on the following findings on topic {vertical}, create google search queries
-        Global Research Plan:
         {research_plan}
-        Searched queries:
         {past_queries}
-        Findings under current topic:
         {ctx_manager}
-        Suggest up to {n} specific google search queries that:
         - Covers what has not been covered yet
         - Builds upon these findings
         - Explores different aspects
         - Goes deeper into important details
         - Do not do quote searches
-        - Queries should be generic and short.
         Return as JSON array of objects with properties:
         - query (string)""")
-        self.report_outline = dedent("""Generate a comprehensive outline for a report based on the findings:
-        Original user query for your context:
         {topic}
-        Findings:
         {ctx_manager}
-        If there are multiple comparisons, only create one heading for all.
         The outline should include:
         - Title
         - List of h2 headings
         Do not include hashtags""")
-        self.report_fillin = dedent("""Fill in the content for the following report outline based on the following research findings:
-        Original user query for your context:
-        {topic}
-        Findings:
         {ctx_manager}
-        The outline:
         {report_outline}
-        Report generated so far:
-        {report_progress}
-        Current heading to fill in:
         ## {slot}
-        The content should be comprehensive, detailed and well-structured, providing detailed information on the topic.
         If needed use tables, lists. Do not include subheadings.
         Do not include the heading in the content.
         """)

     def __init__(self) -> None:
         self.research_plan = dedent("""You are an expert Deep Research agent, part of a Multiagent system.
+        <User query>
+        {topic}
+        </User query>
         ---
         Generate few very high level steps on which other agents can do info collection runs. Provide only data collection steps, no data identification, summarization, manipulation, selection, etc.
+        Do not presume any knowledge about the topic.
         Return a string array of steps.""")
         self.site_summary = dedent("""Extract specific verbatim key information from the following content that is related to the topic "{query}". No small talk.
+        <Findings>
+        {findings}
+        </Findings>
+        """)
         self.continue_branch = dedent("""Given the current state of research, decide whether to continue exploring the current branch or not.
+        <Global Research Plan>
         {research_plan}
+        </Global Research Plan>
         Current Topic: {query}
+        <Past Searched Queries>
         {past_queries}
+        </Past Searched Queries>
+        <Findings under current topic>
         {ctx_manager}
+        </Findings under current topic>
         Consider:
         - Information saturation
         Return only decision: true/false""")
         self.search_query = dedent("""Based on the following findings on topic {vertical}, create google search queries
+        <Global Research Plan>
         {research_plan}
+        </Global Research Plan>
+        <Past Searched Queries>
         {past_queries}
+        </Past Searched Queries>
+        <Findings under current topic>
         {ctx_manager}
+        </Findings under current topic>
+        Suggest {n} specific google search queries that:
         - Covers what has not been covered yet
         - Builds upon these findings
         - Explores different aspects
         - Goes deeper into important details
         - Do not do quote searches
+        - Queries should be generic and short
+        - Do not presume any knowledge about the topic
         Return as JSON array of objects with properties:
         - query (string)""")
+        self.report_outline = dedent("""Generate a outline for a report based on the findings:
+        <Original user query>
         {topic}
+        </Original user query>
+        <Findings>
         {ctx_manager}
+        </Findings>
+        Deduplicate, reorganize and analyze the findings to create the outline.
+        If there are multiple comparisons, use a table instead of multiple headings.
         The outline should include:
         - Title
         - List of h2 headings
         Do not include hashtags""")
+        self.report_fillin = dedent("""Fill in the content for the current outline heading based on the findings:
+        <Findings>
         {ctx_manager}
+        </Findings>
+        <The outline>
         {report_outline}
+        </The outline>
+        <Current outline heading to fill in>
         ## {slot}
+        ...
+        </Current outline heading to fill in>
+        Assume [done] headings have their respective content.
+        The content should be comprehensive, detailed and well-structured, providing detailed information on current heading.
         If needed use tables, lists. Do not include subheadings.
         Do not include the heading in the content.
         """)

backend/scraper.py CHANGED Viewed

@@ -83,11 +83,12 @@ class CrawlForAIScraper:
                     continue
                 search_results.append(url)
-            for i in range(3):
                 if not search_results:
                     self.logger.info("Performing DuckDuckGo search as fallback...")
                     self.logger.warning("No search results found.")
                     search_results = self._duckduckgo_search(query)
             self.logger.info(f"Found {len(search_results)} results")
             return search_results
@@ -102,7 +103,13 @@ class CrawlForAIScraper:
             encoded_query = quote_plus(query)
             url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
-            response = self.session.get(url, headers=self.headers, timeout=self.timeout)
             response.raise_for_status()
             soup = BeautifulSoup(response.text, "html.parser")

                     continue
                 search_results.append(url)
+            for _ in range(3):
                 if not search_results:
                     self.logger.info("Performing DuckDuckGo search as fallback...")
                     self.logger.warning("No search results found.")
                     search_results = self._duckduckgo_search(query)
+                    break
             self.logger.info(f"Found {len(search_results)} results")
             return search_results
             encoded_query = quote_plus(query)
             url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
+            response = self.session.get(
+                url,
+                headers={
+                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+                },
+                timeout=10,
+            )
             response.raise_for_status()
             soup = BeautifulSoup(response.text, "html.parser")