Soham Waghmare committed on
Commit
56e3a38
·
1 Parent(s): 963ab6b

fix: duckduckgo search index error; improved prompts

Browse files
Files changed (2) hide show
  1. backend/knet.py +41 -27
  2. backend/scraper.py +9 -2
backend/knet.py CHANGED
@@ -25,27 +25,35 @@ class Prompt:
25
  def __init__(self) -> None:
26
  self.research_plan = dedent("""You are an expert Deep Research agent, part of a Multiagent system.
27
 
28
- User query:
29
- "{topic}".
 
30
 
31
  ---
32
  Generate few very high level steps on which other agents can do info collection runs. Provide only data collection steps, no data identification, summarization, manipulation, selection, etc.
 
33
  Return a string array of steps.""")
34
 
35
  self.site_summary = dedent("""Extract specific verbatim key information from the following content that is related to the topic "{query}". No small talk.
36
- Findings:
37
- {findings}""")
 
 
38
 
39
  self.continue_branch = dedent("""Given the current state of research, decide whether to continue exploring the current branch or not.
40
- Global Research Plan:
41
  {research_plan}
 
42
 
43
  Current Topic: {query}
44
- Searched Queries:
 
45
  {past_queries}
 
46
 
47
- Findings under current topic:
48
  {ctx_manager}
 
49
 
50
  Consider:
51
  - Information saturation
@@ -56,56 +64,62 @@ class Prompt:
56
  Return only decision: true/false""")
57
 
58
  self.search_query = dedent("""Based on the following findings on topic {vertical}, create google search queries
59
- Global Research Plan:
60
  {research_plan}
 
61
 
62
- Searched queries:
63
  {past_queries}
 
64
 
65
- Findings under current topic:
66
  {ctx_manager}
 
67
 
68
- Suggest up to {n} specific google search queries that:
69
  - Covers what has not been covered yet
70
  - Builds upon these findings
71
  - Explores different aspects
72
  - Goes deeper into important details
73
 
74
  - Do not do quote searches
75
- - Queries should be generic and short.
 
76
  Return as JSON array of objects with properties:
77
  - query (string)""")
78
 
79
- self.report_outline = dedent("""Generate a comprehensive outline for a report based on the findings:
80
- Original user query for your context:
81
  {topic}
 
82
 
83
- Findings:
84
  {ctx_manager}
 
85
 
86
- If there are multiple comparisons, only create one heading for all.
 
87
  The outline should include:
88
  - Title
89
  - List of h2 headings
90
  Do not include hashtags""")
91
 
92
- self.report_fillin = dedent("""Fill in the content for the following report outline based on the following research findings:
93
- Original user query for your context:
94
- {topic}
95
-
96
- Findings:
97
  {ctx_manager}
 
98
 
99
- The outline:
100
  {report_outline}
 
101
 
102
- Report generated so far:
103
- {report_progress}
104
-
105
- Current heading to fill in:
106
  ## {slot}
 
 
107
 
108
- The content should be comprehensive, detailed and well-structured, providing detailed information on the topic.
 
109
  If needed use tables, lists. Do not include subheadings.
110
  Do not include the heading in the content.
111
  """)
 
25
  def __init__(self) -> None:
26
  self.research_plan = dedent("""You are an expert Deep Research agent, part of a Multiagent system.
27
 
28
+ <User query>
29
+ {topic}
30
+ </User query>
31
 
32
  ---
33
  Generate few very high level steps on which other agents can do info collection runs. Provide only data collection steps, no data identification, summarization, manipulation, selection, etc.
34
+ Do not presume any knowledge about the topic.
35
  Return a string array of steps.""")
36
 
37
  self.site_summary = dedent("""Extract specific verbatim key information from the following content that is related to the topic "{query}". No small talk.
38
+ <Findings>
39
+ {findings}
40
+ </Findings>
41
+ """)
42
 
43
  self.continue_branch = dedent("""Given the current state of research, decide whether to continue exploring the current branch or not.
44
+ <Global Research Plan>
45
  {research_plan}
46
+ </Global Research Plan>
47
 
48
  Current Topic: {query}
49
+
50
+ <Past Searched Queries>
51
  {past_queries}
52
+ </Past Searched Queries>
53
 
54
+ <Findings under current topic>
55
  {ctx_manager}
56
+ </Findings under current topic>
57
 
58
  Consider:
59
  - Information saturation
 
64
  Return only decision: true/false""")
65
 
66
  self.search_query = dedent("""Based on the following findings on topic {vertical}, create google search queries
67
+ <Global Research Plan>
68
  {research_plan}
69
+ </Global Research Plan>
70
 
71
+ <Past Searched Queries>
72
  {past_queries}
73
+ </Past Searched Queries>
74
 
75
+ <Findings under current topic>
76
  {ctx_manager}
77
+ </Findings under current topic>
78
 
79
+ Suggest {n} specific google search queries that:
80
  - Covers what has not been covered yet
81
  - Builds upon these findings
82
  - Explores different aspects
83
  - Goes deeper into important details
84
 
85
  - Do not do quote searches
86
+ - Queries should be generic and short
87
+ - Do not presume any knowledge about the topic
88
  Return as JSON array of objects with properties:
89
  - query (string)""")
90
 
91
+ self.report_outline = dedent("""Generate a outline for a report based on the findings:
92
+ <Original user query>
93
  {topic}
94
+ </Original user query>
95
 
96
+ <Findings>
97
  {ctx_manager}
98
+ </Findings>
99
 
100
+ Deduplicate, reorganize and analyze the findings to create the outline.
101
+ If there are multiple comparisons, use a table instead of multiple headings.
102
  The outline should include:
103
  - Title
104
  - List of h2 headings
105
  Do not include hashtags""")
106
 
107
+ self.report_fillin = dedent("""Fill in the content for the current outline heading based on the findings:
108
+ <Findings>
 
 
 
109
  {ctx_manager}
110
+ </Findings>
111
 
112
+ <The outline>
113
  {report_outline}
114
+ </The outline>
115
 
116
+ <Current outline heading to fill in>
 
 
 
117
  ## {slot}
118
+ ...
119
+ </Current outline heading to fill in>
120
 
121
+ Assume [done] headings have their respective content.
122
+ The content should be comprehensive, detailed and well-structured, providing detailed information on current heading.
123
  If needed use tables, lists. Do not include subheadings.
124
  Do not include the heading in the content.
125
  """)
backend/scraper.py CHANGED
@@ -83,11 +83,12 @@ class CrawlForAIScraper:
83
  continue
84
  search_results.append(url)
85
 
86
- for i in range(3):
87
  if not search_results:
88
  self.logger.info("Performing DuckDuckGo search as fallback...")
89
  self.logger.warning("No search results found.")
90
  search_results = self._duckduckgo_search(query)
 
91
 
92
  self.logger.info(f"Found {len(search_results)} results")
93
  return search_results
@@ -102,7 +103,13 @@ class CrawlForAIScraper:
102
  encoded_query = quote_plus(query)
103
  url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
104
 
105
- response = self.session.get(url, headers=self.headers, timeout=self.timeout)
 
 
 
 
 
 
106
  response.raise_for_status()
107
 
108
  soup = BeautifulSoup(response.text, "html.parser")
 
83
  continue
84
  search_results.append(url)
85
 
86
+ for _ in range(3):
87
  if not search_results:
88
  self.logger.info("Performing DuckDuckGo search as fallback...")
89
  self.logger.warning("No search results found.")
90
  search_results = self._duckduckgo_search(query)
91
+ break
92
 
93
  self.logger.info(f"Found {len(search_results)} results")
94
  return search_results
 
103
  encoded_query = quote_plus(query)
104
  url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
105
 
106
+ response = self.session.get(
107
+ url,
108
+ headers={
109
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
110
+ },
111
+ timeout=10,
112
+ )
113
  response.raise_for_status()
114
 
115
  soup = BeautifulSoup(response.text, "html.parser")