Naveen-2007 committed on
Commit
6c86ca6
·
1 Parent(s): 93e0b15

Major fix: Improved web search with Tavily AI answers, better content extraction, enhanced system prompt

Browse files
config/system_prompt.py CHANGED
@@ -1,11 +1,22 @@
1
- PPLX_SYSTEM_PROMPT = """
2
- You are Perplexity AI.
3
 
4
- When user greets (hi, hello, hey), respond like a friendly assistant:
5
- Short, conversational, natural.
 
 
 
 
6
 
7
- Do NOT give definitions or grammar explanations unless user asks.
8
- Your tone: concise, helpful, modern.
 
 
 
9
 
10
- Always adapt style based on question.
 
 
 
 
 
11
  """
 
1
+ PPLX_SYSTEM_PROMPT = """You are Perplexity AI - a powerful AI search assistant that provides accurate, real-time information.
 
2
 
3
+ CORE BEHAVIORS:
4
+ 1. When given web sources, ALWAYS use and cite them using [1], [2], etc.
5
+ 2. NEVER say "I don't have access to real-time data" when sources are provided
6
+ 3. Synthesize information from multiple sources into clear, comprehensive answers
7
+ 4. Be concise but thorough - cover the key points
8
+ 5. For greetings (hi, hello), respond naturally and friendly
9
 
10
+ ANSWER FORMAT:
11
+ - Start with a direct answer to the question
12
+ - Use citations [1], [2] after relevant facts
13
+ - Include specific data points (numbers, dates, names) from sources
14
+ - End with key takeaways if appropriate
15
 
16
+ STYLE:
17
+ - Professional yet conversational
18
+ - Confident and authoritative
19
+ - Modern and helpful
20
+
21
+ Remember: The web sources provided contain REAL, CURRENT information. Use them!
22
  """
rag/agents.py CHANGED
@@ -179,7 +179,7 @@ class WebSearchNode:
179
 
180
 
181
  class WebFetchNode:
182
- """Node 2: Fetch and parse web pages."""
183
 
184
  def __init__(self):
185
  self.browse_tool = BrowseTool()
@@ -188,26 +188,45 @@ class WebFetchNode:
188
  pages = []
189
  links = []
190
 
191
- for r in state.get("search_results", []):
 
 
 
 
 
 
192
  url = r.get("url")
 
193
  if not url:
194
  continue
195
 
 
 
 
 
196
  try:
197
- content = self.browse_tool.fetch_clean(url)
198
- if content:
199
- pages.append({
200
- "title": r.get("title", ""),
201
- "url": url,
202
- "content": content[:2500]
203
- })
204
- links.append({
205
- "title": r.get("title", ""),
206
- "url": url,
207
- "snippet": content[:200]
208
- })
209
  except:
210
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
 
212
  print(f" 📄 WebFetchNode: Fetched {len(pages)} pages")
213
  state["web_pages"] = pages
@@ -220,16 +239,21 @@ class WebContextNode:
220
 
221
  def build_context(self, state: WebSearchState) -> WebSearchState:
222
  pages = state.get("web_pages", [])
 
 
 
 
 
 
 
223
 
224
  if pages:
225
- context_parts = []
226
  for i, p in enumerate(pages):
227
  context_parts.append(f"[{i+1}] {p['title']}:\n{p['content']}")
228
- state["context"] = "\n\n---\n\n".join(context_parts)
229
- else:
230
- state["context"] = ""
231
 
232
- print(f" ๐Ÿ“ WebContextNode: Built context from {len(pages)} sources")
 
 
233
  return state
234
 
235
 
@@ -245,15 +269,16 @@ class WebAnswerNode:
245
  context = state.get("context", "")
246
 
247
  if context:
248
- prompt = f"""You are a web search assistant like Perplexity AI.
249
- Use ONLY the following web sources to answer. Cite sources using [1], [2], etc.
 
250
 
251
  WEB SOURCES:
252
  {context}
253
 
254
- QUESTION: {query}
255
 
256
- Provide a comprehensive, well-cited answer:"""
257
  else:
258
  prompt = f"Answer this question: {query}"
259
 
@@ -270,7 +295,7 @@ Provide a comprehensive, well-cited answer:"""
270
  sources = [{"title": p["title"], "url": p["url"]} for p in state.get("web_pages", [])]
271
  state["sources"] = sources
272
 
273
- print(f" ✅ WebAnswerNode: Generated answer")
274
  return state
275
 
276
 
@@ -480,22 +505,36 @@ class AgenticWebNode:
480
  query = state.get("query", "")
481
 
482
  try:
483
- results = self.search_tool.search(query, num_results=4)
484
  web_parts = []
485
  sources = []
486
  links = []
487
 
 
 
 
 
 
 
488
  for r in results:
489
  url = r.get("url")
490
  title = r.get("title", "")
 
491
  if not url:
492
  continue
493
 
494
- content = self.browse_tool.fetch_clean(url)
495
- if content:
496
- web_parts.append(f"[{title}]: {content[:1500]}")
 
 
 
 
 
 
 
497
  sources.append({"title": title, "url": url})
498
- links.append({"title": title, "url": url, "snippet": content[:150]})
499
 
500
  state["web_context"] = "\n\n".join(web_parts)
501
  state["web_sources"] = sources
 
179
 
180
 
181
  class WebFetchNode:
182
+ """Node 2: Fetch and parse web pages. Uses Tavily content as fallback."""
183
 
184
  def __init__(self):
185
  self.browse_tool = BrowseTool()
 
188
  pages = []
189
  links = []
190
 
191
+ # Check if we have Tavily's direct answer
192
+ tavily_answer = ""
193
+ search_results = state.get("search_results", [])
194
+ if search_results and search_results[0].get("tavily_answer"):
195
+ tavily_answer = search_results[0]["tavily_answer"]
196
+
197
+ for r in search_results:
198
  url = r.get("url")
199
+ title = r.get("title", "")
200
  if not url:
201
  continue
202
 
203
+ # First try to use Tavily's content (snippet)
204
+ tavily_content = r.get("content", "")
205
+
206
+ # Then try to fetch full page
207
  try:
208
+ fetched_content = self.browse_tool.fetch_clean(url)
209
+ content = fetched_content if fetched_content else tavily_content
 
 
 
 
 
 
 
 
 
 
210
  except:
211
+ content = tavily_content
212
+
213
+ # Use whatever content we have
214
+ if content or tavily_content:
215
+ final_content = content if content else tavily_content
216
+ pages.append({
217
+ "title": title,
218
+ "url": url,
219
+ "content": final_content[:2500]
220
+ })
221
+ links.append({
222
+ "title": title,
223
+ "url": url,
224
+ "snippet": (final_content[:200] if final_content else tavily_content[:200])
225
+ })
226
+
227
+ # Add Tavily's answer to state for potential use
228
+ if tavily_answer:
229
+ state["tavily_answer"] = tavily_answer
230
 
231
  print(f" 📄 WebFetchNode: Fetched {len(pages)} pages")
232
  state["web_pages"] = pages
 
239
 
240
  def build_context(self, state: WebSearchState) -> WebSearchState:
241
  pages = state.get("web_pages", [])
242
+ tavily_answer = state.get("tavily_answer", "")
243
+
244
+ context_parts = []
245
+
246
+ # Add Tavily's AI summary first if available
247
+ if tavily_answer:
248
+ context_parts.append(f"[AI Summary]: {tavily_answer}")
249
 
250
  if pages:
 
251
  for i, p in enumerate(pages):
252
  context_parts.append(f"[{i+1}] {p['title']}:\n{p['content']}")
 
 
 
253
 
254
+ state["context"] = "\n\n---\n\n".join(context_parts) if context_parts else ""
255
+
256
+ print(f" ๐Ÿ“ WebContextNode: Built context from {len(pages)} sources" + (", with AI summary" if tavily_answer else ""))
257
  return state
258
 
259
 
 
269
  context = state.get("context", "")
270
 
271
  if context:
272
+ prompt = f"""You are Perplexity AI - a web search assistant that provides accurate, real-time information.
273
+ Use the following web sources to answer. Cite sources using [1], [2], etc.
274
+ IMPORTANT: The sources contain REAL, CURRENT information. Trust and use this data.
275
 
276
  WEB SOURCES:
277
  {context}
278
 
279
+ USER QUESTION: {query}
280
 
281
+ Provide a comprehensive, accurate, well-cited answer based on the sources above:"""
282
  else:
283
  prompt = f"Answer this question: {query}"
284
 
 
295
  sources = [{"title": p["title"], "url": p["url"]} for p in state.get("web_pages", [])]
296
  state["sources"] = sources
297
 
298
+ print(f" ✅ WebAnswerNode: Generated answer with {len(sources)} sources")
299
  return state
300
 
301
 
 
505
  query = state.get("query", "")
506
 
507
  try:
508
+ results = self.search_tool.search(query, num_results=5)
509
  web_parts = []
510
  sources = []
511
  links = []
512
 
513
+ # Get Tavily's AI answer if available
514
+ tavily_answer = ""
515
+ if results and results[0].get("tavily_answer"):
516
+ tavily_answer = results[0]["tavily_answer"]
517
+ web_parts.append(f"[AI Summary]: {tavily_answer}")
518
+
519
  for r in results:
520
  url = r.get("url")
521
  title = r.get("title", "")
522
+ tavily_content = r.get("content", "") # Tavily's snippet
523
  if not url:
524
  continue
525
 
526
+ # Try to fetch full content, fallback to Tavily snippet
527
+ try:
528
+ fetched = self.browse_tool.fetch_clean(url)
529
+ content = fetched if fetched else tavily_content
530
+ except:
531
+ content = tavily_content
532
+
533
+ if content or tavily_content:
534
+ final_content = content if content else tavily_content
535
+ web_parts.append(f"[{title}]: {final_content[:1500]}")
536
  sources.append({"title": title, "url": url})
537
+ links.append({"title": title, "url": url, "snippet": final_content[:150]})
538
 
539
  state["web_context"] = "\n\n".join(web_parts)
540
  state["web_sources"] = sources
rag/rag_state.py CHANGED
@@ -20,6 +20,7 @@ class WebSearchState(TypedDict, total=False):
20
  query: str
21
  search_results: List[Dict]
22
  web_pages: List[Dict]
 
23
  context: str
24
  answer: str
25
  sources: List[Dict]
 
20
  query: str
21
  search_results: List[Dict]
22
  web_pages: List[Dict]
23
+ tavily_answer: str # AI summary from Tavily
24
  context: str
25
  answer: str
26
  sources: List[Dict]
tools/search_tool.py CHANGED
@@ -5,7 +5,7 @@ from config.config import Config
5
 
6
 
7
  class SearchTool:
8
- """Tavily web search wrapper."""
9
 
10
  def __init__(self) -> None:
11
  self.api_key = os.getenv("TAVILY_API_KEY") or Config.TAVILY_API_KEY
@@ -13,19 +13,36 @@ class SearchTool:
13
  raise RuntimeError("TAVILY_API_KEY missing in .env")
14
 
15
  def search(self, query: str, num_results: int = 5) -> List[Dict]:
 
 
 
 
16
  url = "https://api.tavily.com/search"
17
  payload = {
18
  "api_key": self.api_key,
19
  "query": query,
20
  "max_results": num_results,
21
- "include_answer": False,
22
- "include_raw_content": False
 
23
  }
24
  try:
25
- resp = requests.post(url, json=payload, timeout=20)
26
  resp.raise_for_status()
27
  data = resp.json()
28
- return data.get("results", [])
29
- except (requests.exceptions.RequestException, ValueError) as e:
 
 
 
 
 
 
 
 
 
30
  print(f"Search error: {e}")
31
  return []
 
 
 
 
5
 
6
 
7
  class SearchTool:
8
+ """Tavily web search wrapper with content extraction."""
9
 
10
  def __init__(self) -> None:
11
  self.api_key = os.getenv("TAVILY_API_KEY") or Config.TAVILY_API_KEY
 
13
  raise RuntimeError("TAVILY_API_KEY missing in .env")
14
 
15
  def search(self, query: str, num_results: int = 5) -> List[Dict]:
16
+ """
17
+ Search using Tavily API.
18
+ Returns results with title, url, content (snippet from Tavily).
19
+ """
20
  url = "https://api.tavily.com/search"
21
  payload = {
22
  "api_key": self.api_key,
23
  "query": query,
24
  "max_results": num_results,
25
+ "include_answer": True, # Get Tavily's AI answer
26
+ "include_raw_content": False,
27
+ "search_depth": "advanced" # Better results
28
  }
29
  try:
30
+ resp = requests.post(url, json=payload, timeout=30)
31
  resp.raise_for_status()
32
  data = resp.json()
33
+
34
+ results = data.get("results", [])
35
+
36
+ # Add Tavily's answer as metadata if available
37
+ tavily_answer = data.get("answer", "")
38
+ if tavily_answer and results:
39
+ results[0]["tavily_answer"] = tavily_answer
40
+
41
+ print(f" ๐Ÿ” Tavily returned {len(results)} results")
42
+ return results
43
+ except requests.exceptions.RequestException as e:
44
  print(f"Search error: {e}")
45
  return []
46
+ except ValueError as e:
47
+ print(f"Search JSON error: {e}")
48
+ return []