Spaces:
Paused
Paused
Soham Waghmare
committed on
Commit
·
56e3a38
1
Parent(s):
963ab6b
fix: duckduckgo search index error; improved prompts
Browse files
- backend/knet.py +41 -27
- backend/scraper.py +9 -2
backend/knet.py
CHANGED
|
@@ -25,27 +25,35 @@ class Prompt:
|
|
| 25 |
def __init__(self) -> None:
|
| 26 |
self.research_plan = dedent("""You are an expert Deep Research agent, part of a Multiagent system.
|
| 27 |
|
| 28 |
-
User query
|
| 29 |
-
|
|
|
|
| 30 |
|
| 31 |
---
|
| 32 |
Generate few very high level steps on which other agents can do info collection runs. Provide only data collection steps, no data identification, summarization, manipulation, selection, etc.
|
|
|
|
| 33 |
Return a string array of steps.""")
|
| 34 |
|
| 35 |
self.site_summary = dedent("""Extract specific verbatim key information from the following content that is related to the topic "{query}". No small talk.
|
| 36 |
-
Findings
|
| 37 |
-
{findings}
|
|
|
|
|
|
|
| 38 |
|
| 39 |
self.continue_branch = dedent("""Given the current state of research, decide whether to continue exploring the current branch or not.
|
| 40 |
-
Global Research Plan
|
| 41 |
{research_plan}
|
|
|
|
| 42 |
|
| 43 |
Current Topic: {query}
|
| 44 |
-
|
|
|
|
| 45 |
{past_queries}
|
|
|
|
| 46 |
|
| 47 |
-
Findings under current topic
|
| 48 |
{ctx_manager}
|
|
|
|
| 49 |
|
| 50 |
Consider:
|
| 51 |
- Information saturation
|
|
@@ -56,56 +64,62 @@ class Prompt:
|
|
| 56 |
Return only decision: true/false""")
|
| 57 |
|
| 58 |
self.search_query = dedent("""Based on the following findings on topic {vertical}, create google search queries
|
| 59 |
-
Global Research Plan
|
| 60 |
{research_plan}
|
|
|
|
| 61 |
|
| 62 |
-
Searched
|
| 63 |
{past_queries}
|
|
|
|
| 64 |
|
| 65 |
-
Findings under current topic
|
| 66 |
{ctx_manager}
|
|
|
|
| 67 |
|
| 68 |
-
Suggest
|
| 69 |
- Covers what has not been covered yet
|
| 70 |
- Builds upon these findings
|
| 71 |
- Explores different aspects
|
| 72 |
- Goes deeper into important details
|
| 73 |
|
| 74 |
- Do not do quote searches
|
| 75 |
-
- Queries should be generic and short
|
|
|
|
| 76 |
Return as JSON array of objects with properties:
|
| 77 |
- query (string)""")
|
| 78 |
|
| 79 |
-
self.report_outline = dedent("""Generate a
|
| 80 |
-
Original user query
|
| 81 |
{topic}
|
|
|
|
| 82 |
|
| 83 |
-
Findings
|
| 84 |
{ctx_manager}
|
|
|
|
| 85 |
|
| 86 |
-
|
|
|
|
| 87 |
The outline should include:
|
| 88 |
- Title
|
| 89 |
- List of h2 headings
|
| 90 |
Do not include hashtags""")
|
| 91 |
|
| 92 |
-
self.report_fillin = dedent("""Fill in the content for the
|
| 93 |
-
|
| 94 |
-
{topic}
|
| 95 |
-
|
| 96 |
-
Findings:
|
| 97 |
{ctx_manager}
|
|
|
|
| 98 |
|
| 99 |
-
The outline
|
| 100 |
{report_outline}
|
|
|
|
| 101 |
|
| 102 |
-
|
| 103 |
-
{report_progress}
|
| 104 |
-
|
| 105 |
-
Current heading to fill in:
|
| 106 |
## {slot}
|
|
|
|
|
|
|
| 107 |
|
| 108 |
-
|
|
|
|
| 109 |
If needed use tables, lists. Do not include subheadings.
|
| 110 |
Do not include the heading in the content.
|
| 111 |
""")
|
|
|
|
| 25 |
def __init__(self) -> None:
|
| 26 |
self.research_plan = dedent("""You are an expert Deep Research agent, part of a Multiagent system.
|
| 27 |
|
| 28 |
+
<User query>
|
| 29 |
+
{topic}
|
| 30 |
+
</User query>
|
| 31 |
|
| 32 |
---
|
| 33 |
Generate few very high level steps on which other agents can do info collection runs. Provide only data collection steps, no data identification, summarization, manipulation, selection, etc.
|
| 34 |
+
Do not presume any knowledge about the topic.
|
| 35 |
Return a string array of steps.""")
|
| 36 |
|
| 37 |
self.site_summary = dedent("""Extract specific verbatim key information from the following content that is related to the topic "{query}". No small talk.
|
| 38 |
+
<Findings>
|
| 39 |
+
{findings}
|
| 40 |
+
</Findings>
|
| 41 |
+
""")
|
| 42 |
|
| 43 |
self.continue_branch = dedent("""Given the current state of research, decide whether to continue exploring the current branch or not.
|
| 44 |
+
<Global Research Plan>
|
| 45 |
{research_plan}
|
| 46 |
+
</Global Research Plan>
|
| 47 |
|
| 48 |
Current Topic: {query}
|
| 49 |
+
|
| 50 |
+
<Past Searched Queries>
|
| 51 |
{past_queries}
|
| 52 |
+
</Past Searched Queries>
|
| 53 |
|
| 54 |
+
<Findings under current topic>
|
| 55 |
{ctx_manager}
|
| 56 |
+
</Findings under current topic>
|
| 57 |
|
| 58 |
Consider:
|
| 59 |
- Information saturation
|
|
|
|
| 64 |
Return only decision: true/false""")
|
| 65 |
|
| 66 |
self.search_query = dedent("""Based on the following findings on topic {vertical}, create google search queries
|
| 67 |
+
<Global Research Plan>
|
| 68 |
{research_plan}
|
| 69 |
+
</Global Research Plan>
|
| 70 |
|
| 71 |
+
<Past Searched Queries>
|
| 72 |
{past_queries}
|
| 73 |
+
</Past Searched Queries>
|
| 74 |
|
| 75 |
+
<Findings under current topic>
|
| 76 |
{ctx_manager}
|
| 77 |
+
</Findings under current topic>
|
| 78 |
|
| 79 |
+
Suggest {n} specific google search queries that:
|
| 80 |
- Covers what has not been covered yet
|
| 81 |
- Builds upon these findings
|
| 82 |
- Explores different aspects
|
| 83 |
- Goes deeper into important details
|
| 84 |
|
| 85 |
- Do not do quote searches
|
| 86 |
+
- Queries should be generic and short
|
| 87 |
+
- Do not presume any knowledge about the topic
|
| 88 |
Return as JSON array of objects with properties:
|
| 89 |
- query (string)""")
|
| 90 |
|
| 91 |
+
self.report_outline = dedent("""Generate a outline for a report based on the findings:
|
| 92 |
+
<Original user query>
|
| 93 |
{topic}
|
| 94 |
+
</Original user query>
|
| 95 |
|
| 96 |
+
<Findings>
|
| 97 |
{ctx_manager}
|
| 98 |
+
</Findings>
|
| 99 |
|
| 100 |
+
Deduplicate, reorganize and analyze the findings to create the outline.
|
| 101 |
+
If there are multiple comparisons, use a table instead of multiple headings.
|
| 102 |
The outline should include:
|
| 103 |
- Title
|
| 104 |
- List of h2 headings
|
| 105 |
Do not include hashtags""")
|
| 106 |
|
| 107 |
+
self.report_fillin = dedent("""Fill in the content for the current outline heading based on the findings:
|
| 108 |
+
<Findings>
|
|
|
|
|
|
|
|
|
|
| 109 |
{ctx_manager}
|
| 110 |
+
</Findings>
|
| 111 |
|
| 112 |
+
<The outline>
|
| 113 |
{report_outline}
|
| 114 |
+
</The outline>
|
| 115 |
|
| 116 |
+
<Current outline heading to fill in>
|
|
|
|
|
|
|
|
|
|
| 117 |
## {slot}
|
| 118 |
+
...
|
| 119 |
+
</Current outline heading to fill in>
|
| 120 |
|
| 121 |
+
Assume [done] headings have their respective content.
|
| 122 |
+
The content should be comprehensive, detailed and well-structured, providing detailed information on current heading.
|
| 123 |
If needed use tables, lists. Do not include subheadings.
|
| 124 |
Do not include the heading in the content.
|
| 125 |
""")
|
backend/scraper.py
CHANGED
|
@@ -83,11 +83,12 @@ class CrawlForAIScraper:
|
|
| 83 |
continue
|
| 84 |
search_results.append(url)
|
| 85 |
|
| 86 |
-
for
|
| 87 |
if not search_results:
|
| 88 |
self.logger.info("Performing DuckDuckGo search as fallback...")
|
| 89 |
self.logger.warning("No search results found.")
|
| 90 |
search_results = self._duckduckgo_search(query)
|
|
|
|
| 91 |
|
| 92 |
self.logger.info(f"Found {len(search_results)} results")
|
| 93 |
return search_results
|
|
@@ -102,7 +103,13 @@ class CrawlForAIScraper:
|
|
| 102 |
encoded_query = quote_plus(query)
|
| 103 |
url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
|
| 104 |
|
| 105 |
-
response = self.session.get(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
response.raise_for_status()
|
| 107 |
|
| 108 |
soup = BeautifulSoup(response.text, "html.parser")
|
|
|
|
| 83 |
continue
|
| 84 |
search_results.append(url)
|
| 85 |
|
| 86 |
+
for _ in range(3):
|
| 87 |
if not search_results:
|
| 88 |
self.logger.info("Performing DuckDuckGo search as fallback...")
|
| 89 |
self.logger.warning("No search results found.")
|
| 90 |
search_results = self._duckduckgo_search(query)
|
| 91 |
+
break
|
| 92 |
|
| 93 |
self.logger.info(f"Found {len(search_results)} results")
|
| 94 |
return search_results
|
|
|
|
| 103 |
encoded_query = quote_plus(query)
|
| 104 |
url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
|
| 105 |
|
| 106 |
+
response = self.session.get(
|
| 107 |
+
url,
|
| 108 |
+
headers={
|
| 109 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
| 110 |
+
},
|
| 111 |
+
timeout=10,
|
| 112 |
+
)
|
| 113 |
response.raise_for_status()
|
| 114 |
|
| 115 |
soup = BeautifulSoup(response.text, "html.parser")
|