GLECO committed on
Commit
6a4320d
·
1 Parent(s): 43fcb87

Ajout tool pour webpage

Browse files
Files changed (1) hide show
  1. visit_web_pages_tool.py +50 -0
visit_web_pages_tool.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import requests
3
+ from markdownify import markdownify
4
+ from requests.exceptions import RequestException
5
+ from langchain_core.tools import tool
6
+ import requests
7
+ from langchain_community.tools import DuckDuckGoSearchResults
8
+
9
# Default HTTP request headers that imitate a desktop browser.
DEFAULT_HEADERS = {
    # Generic, browser-like UA. For Wikipedia, better to identify your app & contact.
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;"
        "q=0.9,image/avif,image/webp,*/*;q=0.8"
    ),
    "Accept-Language": "en-US,en;q=0.5",
    # Advertise only encodings that can always be decoded. "br" is left out on
    # purpose: requests/urllib3 decode Brotli only when an optional brotli
    # package is installed, and otherwise hand back the compressed bytes as-is.
    "Accept-Encoding": "gzip, deflate",
}
23
+
24
# Remote service that downloads a page and renders it for us.
_URL_TO_MARKDOWN_ENDPOINT = "https://urltomarkdown.herokuapp.com/"
# Upper bound, in seconds, so a stalled connection cannot block the caller forever.
_REQUEST_TIMEOUT_SECONDS = 30


def visit_webpage(url: str) -> str:
    """Fetch a webpage through the url-to-markdown service and return Markdown.

    The function never raises: every failure is reported as a human-readable
    error string, so it can be used directly as an agent tool.

    Args:
        url: The URL of the webpage to visit.

    Returns:
        The content of the webpage converted to Markdown, or an error message
        if the URL is missing or the request fails.
    """
    # Reject empty / non-string input up front instead of sending a
    # meaningless request to the conversion service.
    if not isinstance(url, str) or not url.strip():
        return "Error fetching the webpage: no URL was provided"

    try:
        # Pass the target through `params` so it is percent-encoded. Plain
        # concatenation would let a "&" or "#" inside the target URL be
        # interpreted as part of the service's own query string.
        response = requests.get(
            _URL_TO_MARKDOWN_ENDPOINT,
            params={"url": url.strip()},
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()  # Raise an exception for bad status codes

        # NOTE(review): the service presumably already answers with Markdown;
        # running markdownify over it may escape or reflow that text. Kept to
        # preserve the existing output - confirm whether it is still wanted.
        markdown_content = markdownify(response.text).strip()

        # Collapse runs of three or more newlines into a single blank line.
        return re.sub(r"\n{3,}", "\n\n", markdown_content)

    except RequestException as e:
        # Covers connection errors, timeouts and bad HTTP status codes.
        return f"Error fetching the webpage: {e}"
    except Exception as e:
        # Deliberate catch-all: a tool call should report problems, not crash.
        return f"An unexpected error occurred: {e}"