serverdaun committed on
Commit
4374e3a
·
1 Parent(s): 67a532b

update wiki_search and add scrape_webpage tool

Browse files
Files changed (1) hide show
  1. tools.py +76 -16
tools.py CHANGED
@@ -3,6 +3,8 @@ from langchain_community.tools.tavily_search import TavilySearchResults
3
  from langchain_community.document_loaders import WikipediaLoader
4
  from langchain_community.document_loaders import ArxivLoader
5
  from config import TAVILY_API_KEY
 
 
6
 
7
 
8
  #=========================================
@@ -11,23 +13,57 @@ from config import TAVILY_API_KEY
11
  @tool
12
  def wiki_search(query: str) -> str:
13
  """
14
- Search Wikipedia for a given query and return top 3 results.
15
  Args:
16
  query (str): The search query.
17
  Returns:
18
- str: Formatted string containing the titles, URLs and content of the top 3 Wikipedia articles.
19
- """
20
- docs = WikipediaLoader(query=query, load_max_docs=3).load()
21
-
22
- # Format the results
23
- formatted_results = "\n\n\n--------------\n\n\n".join(
24
- [
25
- f"*Metadata*:\nTitle: {doc.metadata.get('title')}\nURL: {doc.metadata.get('source')}\n\n"
26
- f"*Content*:\n{doc.page_content}"
27
- for doc in docs
28
- ]
29
- )
30
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  return formatted_results
32
 
33
  @tool
@@ -39,7 +75,7 @@ def tavily_search(query: str) -> str:
39
  Returns:
40
  str: Formatted string containing the titles, URLs and content of the top 3 Tavily search results.
41
  """
42
- results = TavilySearchResults(max_results=3, tavily_api_key=TAVILY_API_KEY).invoke({"query": query})
43
 
44
  # Format the results
45
  formatted_results = "\n\n\n--------------\n\n\n".join(
@@ -61,7 +97,7 @@ def arxiv_search(query: str) -> str:
61
  Returns:
62
  str: Formatted string containing the titles, URLs and content of the top 3 Arxiv search results.
63
  """
64
- docs = ArxivLoader(query=query, load_max_docs=3).load()
65
 
66
  # Format the results
67
  formatted_results = "\n\n\n--------------\n\n\n".join(
@@ -74,6 +110,30 @@ def arxiv_search(query: str) -> str:
74
 
75
  return formatted_results
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  #=========================================
79
  # Math Tools
 
3
  from langchain_community.document_loaders import WikipediaLoader
4
  from langchain_community.document_loaders import ArxivLoader
5
  from config import TAVILY_API_KEY
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
 
9
 
10
  #=========================================
 
13
@tool
def wiki_search(query: str) -> str:
    """
    Search Wikipedia for a given query, returning the top 2 results with the
    full scraped article content.

    Args:
        query (str): The search query.

    Returns:
        str: Formatted string containing the title, URL and full webpage
            content of the top 2 Wikipedia articles. If scraping a page
            fails, only its title/URL summary is included for that article.
    """
    docs = WikipediaLoader(query=query, load_max_docs=2).load()

    results = []
    for doc in docs:
        # Summary header with title and canonical URL from loader metadata.
        wiki_summary = f"\nTitle: {doc.metadata.get('title')}\nURL: {doc.metadata.get('source')}\n\n"

        # Scrape and clean the full webpage; fall back to the summary on failure.
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
            # timeout prevents the agent tool from hanging indefinitely on a
            # slow or unresponsive page (requests has no default timeout).
            response = requests.get(doc.metadata.get('source'), headers=headers, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Strip page chrome that adds noise for a language-model reader.
            unwanted_elements = [
                '.mw-jump-link', '.mw-editsection', '.reference',  # Wiki specific
                '#mw-navigation', '#mw-head', '#mw-panel',  # Navigation
                '.navbox', '.vertical-navbox', '.sidebar',  # Navigation boxes
                '.noprint', '.printfooter', '.catlinks',  # Printing related
                '#toc', '.toc', '#site-navigation',  # Table of contents
            ]
            for element in soup.select(','.join(unwanted_elements)):
                element.decompose()

            # Prefer the main article body when present.
            content_div = soup.select_one('#mw-content-text')
            if content_div:
                # Remove disambiguation/hatnote boxes inside the article body.
                for disambig in content_div.select('.hatnote, .dmbox-disambig'):
                    disambig.decompose()
                full_text = content_div.get_text(separator='\n', strip=True)
            else:
                full_text = soup.get_text(separator='\n', strip=True)

            # Combine wiki summary with cleaned webpage content.
            combined_result = f"{wiki_summary}\n### Full Article Content ###\n{full_text}"
            results.append(combined_result)

        except Exception:
            # Best-effort: keep at least the title/URL when scraping fails.
            results.append(wiki_summary)

    # BUG FIX: the original expression
    #   "\n\n" + "="*20 + "\n\n".join(results)
    # only joined results with "\n\n" (operator precedence) and prepended a
    # single "=" bar; build the full separator and join with it instead.
    separator = "\n\n" + "=" * 20 + "\n\n"
    formatted_results = separator + separator.join(results)
    return formatted_results
68
 
69
  @tool
 
75
  Returns:
76
  str: Formatted string containing the titles, URLs and content of the top 3 Tavily search results.
77
  """
78
+ results = TavilySearchResults(max_results=5, tavily_api_key=TAVILY_API_KEY).invoke({"query": query})
79
 
80
  # Format the results
81
  formatted_results = "\n\n\n--------------\n\n\n".join(
 
97
  Returns:
98
  str: Formatted string containing the titles, URLs and content of the top 3 Arxiv search results.
99
  """
100
+ docs = ArxivLoader(query=query, load_max_docs=5).load()
101
 
102
  # Format the results
103
  formatted_results = "\n\n\n--------------\n\n\n".join(
 
110
 
111
  return formatted_results
112
 
113
@tool
def scrape_webpage(url: str) -> str:
    """
    Scrape the main text content from a webpage.

    Args:
        url (str): The URL of the webpage to scrape.

    Returns:
        str: The text content of the webpage with scripts and styles removed,
            or an "Error scraping webpage: ..." message if the fetch or
            parse fails (this tool never raises to the caller).
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        # timeout prevents the agent tool from hanging indefinitely; requests
        # has no default timeout and would otherwise block forever.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove script and style elements before extracting text.
        for script in soup(['script', 'style']):
            script.decompose()

        # Get text content, one line per element, with whitespace stripped.
        text = soup.get_text(separator='\n', strip=True)
        return text
    except Exception as e:
        # Broad catch is deliberate: an agent tool should report the failure
        # as a string result rather than crash the agent loop.
        return f"Error scraping webpage: {str(e)}"
137
 
138
  #=========================================
139
  # Math Tools