Spaces:
Running
Running
Commit ·
6c86ca6
1
Parent(s): 93e0b15
Major fix: Improved web search with Tavily AI answers, better content extraction, enhanced system prompt
Browse files
- config/system_prompt.py +18 -7
- rag/agents.py +69 -30
- rag/rag_state.py +1 -0
- tools/search_tool.py +23 -6
config/system_prompt.py
CHANGED
|
@@ -1,11 +1,22 @@
|
|
| 1 |
-
PPLX_SYSTEM_PROMPT = """
|
| 2 |
-
You are Perplexity AI.
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
"""
|
|
|
|
| 1 |
+
PPLX_SYSTEM_PROMPT = """You are Perplexity AI - a powerful AI search assistant that provides accurate, real-time information.
|
|
|
|
| 2 |
|
| 3 |
+
CORE BEHAVIORS:
|
| 4 |
+
1. When given web sources, ALWAYS use and cite them using [1], [2], etc.
|
| 5 |
+
2. NEVER say "I don't have access to real-time data" when sources are provided
|
| 6 |
+
3. Synthesize information from multiple sources into clear, comprehensive answers
|
| 7 |
+
4. Be concise but thorough - cover the key points
|
| 8 |
+
5. For greetings (hi, hello), respond naturally and friendly
|
| 9 |
|
| 10 |
+
ANSWER FORMAT:
|
| 11 |
+
- Start with a direct answer to the question
|
| 12 |
+
- Use citations [1], [2] after relevant facts
|
| 13 |
+
- Include specific data points (numbers, dates, names) from sources
|
| 14 |
+
- End with key takeaways if appropriate
|
| 15 |
|
| 16 |
+
STYLE:
|
| 17 |
+
- Professional yet conversational
|
| 18 |
+
- Confident and authoritative
|
| 19 |
+
- Modern and helpful
|
| 20 |
+
|
| 21 |
+
Remember: The web sources provided contain REAL, CURRENT information. Use them!
|
| 22 |
"""
|
rag/agents.py
CHANGED
|
@@ -179,7 +179,7 @@ class WebSearchNode:
|
|
| 179 |
|
| 180 |
|
| 181 |
class WebFetchNode:
|
| 182 |
-
"""Node 2: Fetch and parse web pages."""
|
| 183 |
|
| 184 |
def __init__(self):
|
| 185 |
self.browse_tool = BrowseTool()
|
|
@@ -188,26 +188,45 @@ class WebFetchNode:
|
|
| 188 |
pages = []
|
| 189 |
links = []
|
| 190 |
|
| 191 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
url = r.get("url")
|
|
|
|
| 193 |
if not url:
|
| 194 |
continue
|
| 195 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
try:
|
| 197 |
-
|
| 198 |
-
if
|
| 199 |
-
pages.append({
|
| 200 |
-
"title": r.get("title", ""),
|
| 201 |
-
"url": url,
|
| 202 |
-
"content": content[:2500]
|
| 203 |
-
})
|
| 204 |
-
links.append({
|
| 205 |
-
"title": r.get("title", ""),
|
| 206 |
-
"url": url,
|
| 207 |
-
"snippet": content[:200]
|
| 208 |
-
})
|
| 209 |
except:
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
|
| 212 |
print(f" ๐ WebFetchNode: Fetched {len(pages)} pages")
|
| 213 |
state["web_pages"] = pages
|
|
@@ -220,16 +239,21 @@ class WebContextNode:
|
|
| 220 |
|
| 221 |
def build_context(self, state: WebSearchState) -> WebSearchState:
|
| 222 |
pages = state.get("web_pages", [])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
|
| 224 |
if pages:
|
| 225 |
-
context_parts = []
|
| 226 |
for i, p in enumerate(pages):
|
| 227 |
context_parts.append(f"[{i+1}] {p['title']}:\n{p['content']}")
|
| 228 |
-
state["context"] = "\n\n---\n\n".join(context_parts)
|
| 229 |
-
else:
|
| 230 |
-
state["context"] = ""
|
| 231 |
|
| 232 |
-
|
|
|
|
|
|
|
| 233 |
return state
|
| 234 |
|
| 235 |
|
|
@@ -245,15 +269,16 @@ class WebAnswerNode:
|
|
| 245 |
context = state.get("context", "")
|
| 246 |
|
| 247 |
if context:
|
| 248 |
-
prompt = f"""You are a web search assistant
|
| 249 |
-
Use
|
|
|
|
| 250 |
|
| 251 |
WEB SOURCES:
|
| 252 |
{context}
|
| 253 |
|
| 254 |
-
QUESTION: {query}
|
| 255 |
|
| 256 |
-
Provide a comprehensive, well-cited answer:"""
|
| 257 |
else:
|
| 258 |
prompt = f"Answer this question: {query}"
|
| 259 |
|
|
@@ -270,7 +295,7 @@ Provide a comprehensive, well-cited answer:"""
|
|
| 270 |
sources = [{"title": p["title"], "url": p["url"]} for p in state.get("web_pages", [])]
|
| 271 |
state["sources"] = sources
|
| 272 |
|
| 273 |
-
print(f" ✅ WebAnswerNode: Generated answer")
|
| 274 |
return state
|
| 275 |
|
| 276 |
|
|
@@ -480,22 +505,36 @@ class AgenticWebNode:
|
|
| 480 |
query = state.get("query", "")
|
| 481 |
|
| 482 |
try:
|
| 483 |
-
results = self.search_tool.search(query, num_results=
|
| 484 |
web_parts = []
|
| 485 |
sources = []
|
| 486 |
links = []
|
| 487 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
for r in results:
|
| 489 |
url = r.get("url")
|
| 490 |
title = r.get("title", "")
|
|
|
|
| 491 |
if not url:
|
| 492 |
continue
|
| 493 |
|
| 494 |
-
content
|
| 495 |
-
|
| 496 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 497 |
sources.append({"title": title, "url": url})
|
| 498 |
-
links.append({"title": title, "url": url, "snippet":
|
| 499 |
|
| 500 |
state["web_context"] = "\n\n".join(web_parts)
|
| 501 |
state["web_sources"] = sources
|
|
|
|
| 179 |
|
| 180 |
|
| 181 |
class WebFetchNode:
|
| 182 |
+
"""Node 2: Fetch and parse web pages. Uses Tavily content as fallback."""
|
| 183 |
|
| 184 |
def __init__(self):
|
| 185 |
self.browse_tool = BrowseTool()
|
|
|
|
| 188 |
pages = []
|
| 189 |
links = []
|
| 190 |
|
| 191 |
+
# Check if we have Tavily's direct answer
|
| 192 |
+
tavily_answer = ""
|
| 193 |
+
search_results = state.get("search_results", [])
|
| 194 |
+
if search_results and search_results[0].get("tavily_answer"):
|
| 195 |
+
tavily_answer = search_results[0]["tavily_answer"]
|
| 196 |
+
|
| 197 |
+
for r in search_results:
|
| 198 |
url = r.get("url")
|
| 199 |
+
title = r.get("title", "")
|
| 200 |
if not url:
|
| 201 |
continue
|
| 202 |
|
| 203 |
+
# First try to use Tavily's content (snippet)
|
| 204 |
+
tavily_content = r.get("content", "")
|
| 205 |
+
|
| 206 |
+
# Then try to fetch full page
|
| 207 |
try:
|
| 208 |
+
fetched_content = self.browse_tool.fetch_clean(url)
|
| 209 |
+
content = fetched_content if fetched_content else tavily_content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
except:
|
| 211 |
+
content = tavily_content
|
| 212 |
+
|
| 213 |
+
# Use whatever content we have
|
| 214 |
+
if content or tavily_content:
|
| 215 |
+
final_content = content if content else tavily_content
|
| 216 |
+
pages.append({
|
| 217 |
+
"title": title,
|
| 218 |
+
"url": url,
|
| 219 |
+
"content": final_content[:2500]
|
| 220 |
+
})
|
| 221 |
+
links.append({
|
| 222 |
+
"title": title,
|
| 223 |
+
"url": url,
|
| 224 |
+
"snippet": (final_content[:200] if final_content else tavily_content[:200])
|
| 225 |
+
})
|
| 226 |
+
|
| 227 |
+
# Add Tavily's answer to state for potential use
|
| 228 |
+
if tavily_answer:
|
| 229 |
+
state["tavily_answer"] = tavily_answer
|
| 230 |
|
| 231 |
print(f" ๐ WebFetchNode: Fetched {len(pages)} pages")
|
| 232 |
state["web_pages"] = pages
|
|
|
|
| 239 |
|
| 240 |
def build_context(self, state: WebSearchState) -> WebSearchState:
|
| 241 |
pages = state.get("web_pages", [])
|
| 242 |
+
tavily_answer = state.get("tavily_answer", "")
|
| 243 |
+
|
| 244 |
+
context_parts = []
|
| 245 |
+
|
| 246 |
+
# Add Tavily's AI summary first if available
|
| 247 |
+
if tavily_answer:
|
| 248 |
+
context_parts.append(f"[AI Summary]: {tavily_answer}")
|
| 249 |
|
| 250 |
if pages:
|
|
|
|
| 251 |
for i, p in enumerate(pages):
|
| 252 |
context_parts.append(f"[{i+1}] {p['title']}:\n{p['content']}")
|
|
|
|
|
|
|
|
|
|
| 253 |
|
| 254 |
+
state["context"] = "\n\n---\n\n".join(context_parts) if context_parts else ""
|
| 255 |
+
|
| 256 |
+
print(f" ๐ WebContextNode: Built context from {len(pages)} sources" + (", with AI summary" if tavily_answer else ""))
|
| 257 |
return state
|
| 258 |
|
| 259 |
|
|
|
|
| 269 |
context = state.get("context", "")
|
| 270 |
|
| 271 |
if context:
|
| 272 |
+
prompt = f"""You are Perplexity AI - a web search assistant that provides accurate, real-time information.
|
| 273 |
+
Use the following web sources to answer. Cite sources using [1], [2], etc.
|
| 274 |
+
IMPORTANT: The sources contain REAL, CURRENT information. Trust and use this data.
|
| 275 |
|
| 276 |
WEB SOURCES:
|
| 277 |
{context}
|
| 278 |
|
| 279 |
+
USER QUESTION: {query}
|
| 280 |
|
| 281 |
+
Provide a comprehensive, accurate, well-cited answer based on the sources above:"""
|
| 282 |
else:
|
| 283 |
prompt = f"Answer this question: {query}"
|
| 284 |
|
|
|
|
| 295 |
sources = [{"title": p["title"], "url": p["url"]} for p in state.get("web_pages", [])]
|
| 296 |
state["sources"] = sources
|
| 297 |
|
| 298 |
+
print(f" ✅ WebAnswerNode: Generated answer with {len(sources)} sources")
|
| 299 |
return state
|
| 300 |
|
| 301 |
|
|
|
|
| 505 |
query = state.get("query", "")
|
| 506 |
|
| 507 |
try:
|
| 508 |
+
results = self.search_tool.search(query, num_results=5)
|
| 509 |
web_parts = []
|
| 510 |
sources = []
|
| 511 |
links = []
|
| 512 |
|
| 513 |
+
# Get Tavily's AI answer if available
|
| 514 |
+
tavily_answer = ""
|
| 515 |
+
if results and results[0].get("tavily_answer"):
|
| 516 |
+
tavily_answer = results[0]["tavily_answer"]
|
| 517 |
+
web_parts.append(f"[AI Summary]: {tavily_answer}")
|
| 518 |
+
|
| 519 |
for r in results:
|
| 520 |
url = r.get("url")
|
| 521 |
title = r.get("title", "")
|
| 522 |
+
tavily_content = r.get("content", "") # Tavily's snippet
|
| 523 |
if not url:
|
| 524 |
continue
|
| 525 |
|
| 526 |
+
# Try to fetch full content, fallback to Tavily snippet
|
| 527 |
+
try:
|
| 528 |
+
fetched = self.browse_tool.fetch_clean(url)
|
| 529 |
+
content = fetched if fetched else tavily_content
|
| 530 |
+
except:
|
| 531 |
+
content = tavily_content
|
| 532 |
+
|
| 533 |
+
if content or tavily_content:
|
| 534 |
+
final_content = content if content else tavily_content
|
| 535 |
+
web_parts.append(f"[{title}]: {final_content[:1500]}")
|
| 536 |
sources.append({"title": title, "url": url})
|
| 537 |
+
links.append({"title": title, "url": url, "snippet": final_content[:150]})
|
| 538 |
|
| 539 |
state["web_context"] = "\n\n".join(web_parts)
|
| 540 |
state["web_sources"] = sources
|
rag/rag_state.py
CHANGED
|
@@ -20,6 +20,7 @@ class WebSearchState(TypedDict, total=False):
|
|
| 20 |
query: str
|
| 21 |
search_results: List[Dict]
|
| 22 |
web_pages: List[Dict]
|
|
|
|
| 23 |
context: str
|
| 24 |
answer: str
|
| 25 |
sources: List[Dict]
|
|
|
|
| 20 |
query: str
|
| 21 |
search_results: List[Dict]
|
| 22 |
web_pages: List[Dict]
|
| 23 |
+
tavily_answer: str # AI summary from Tavily
|
| 24 |
context: str
|
| 25 |
answer: str
|
| 26 |
sources: List[Dict]
|
tools/search_tool.py
CHANGED
|
@@ -5,7 +5,7 @@ from config.config import Config
|
|
| 5 |
|
| 6 |
|
| 7 |
class SearchTool:
|
| 8 |
-
"""Tavily web search wrapper."""
|
| 9 |
|
| 10 |
def __init__(self) -> None:
|
| 11 |
self.api_key = os.getenv("TAVILY_API_KEY") or Config.TAVILY_API_KEY
|
|
@@ -13,19 +13,36 @@ class SearchTool:
|
|
| 13 |
raise RuntimeError("TAVILY_API_KEY missing in .env")
|
| 14 |
|
| 15 |
def search(self, query: str, num_results: int = 5) -> List[Dict]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
url = "https://api.tavily.com/search"
|
| 17 |
payload = {
|
| 18 |
"api_key": self.api_key,
|
| 19 |
"query": query,
|
| 20 |
"max_results": num_results,
|
| 21 |
-
"include_answer":
|
| 22 |
-
"include_raw_content": False
|
|
|
|
| 23 |
}
|
| 24 |
try:
|
| 25 |
-
resp = requests.post(url, json=payload, timeout=
|
| 26 |
resp.raise_for_status()
|
| 27 |
data = resp.json()
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
print(f"Search error: {e}")
|
| 31 |
return []
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
class SearchTool:
|
| 8 |
+
"""Tavily web search wrapper with content extraction."""
|
| 9 |
|
| 10 |
def __init__(self) -> None:
|
| 11 |
self.api_key = os.getenv("TAVILY_API_KEY") or Config.TAVILY_API_KEY
|
|
|
|
| 13 |
raise RuntimeError("TAVILY_API_KEY missing in .env")
|
| 14 |
|
| 15 |
def search(self, query: str, num_results: int = 5) -> List[Dict]:
|
| 16 |
+
"""
|
| 17 |
+
Search using Tavily API.
|
| 18 |
+
Returns results with title, url, content (snippet from Tavily).
|
| 19 |
+
"""
|
| 20 |
url = "https://api.tavily.com/search"
|
| 21 |
payload = {
|
| 22 |
"api_key": self.api_key,
|
| 23 |
"query": query,
|
| 24 |
"max_results": num_results,
|
| 25 |
+
"include_answer": True, # Get Tavily's AI answer
|
| 26 |
+
"include_raw_content": False,
|
| 27 |
+
"search_depth": "advanced" # Better results
|
| 28 |
}
|
| 29 |
try:
|
| 30 |
+
resp = requests.post(url, json=payload, timeout=30)
|
| 31 |
resp.raise_for_status()
|
| 32 |
data = resp.json()
|
| 33 |
+
|
| 34 |
+
results = data.get("results", [])
|
| 35 |
+
|
| 36 |
+
# Add Tavily's answer as metadata if available
|
| 37 |
+
tavily_answer = data.get("answer", "")
|
| 38 |
+
if tavily_answer and results:
|
| 39 |
+
results[0]["tavily_answer"] = tavily_answer
|
| 40 |
+
|
| 41 |
+
print(f" ๐ Tavily returned {len(results)} results")
|
| 42 |
+
return results
|
| 43 |
+
except requests.exceptions.RequestException as e:
|
| 44 |
print(f"Search error: {e}")
|
| 45 |
return []
|
| 46 |
+
except ValueError as e:
|
| 47 |
+
print(f"Search JSON error: {e}")
|
| 48 |
+
return []
|