Spaces:
Sleeping
Sleeping
| from smolagents import Tool | |
| from selenium import webdriver | |
| from selenium.webdriver.chrome.options import Options | |
| from selenium.webdriver.chrome.service import Service as ChromeService | |
| from webdriver_manager.chrome import ChromeDriverManager | |
| from markdownify import markdownify as md | |
class ExtractWebContentWithSelenium(Tool):
    """Agent tool that loads a URL in headless Chrome and returns it as Markdown.

    The page is opened with Selenium, the page source reported by the browser
    is read, and that HTML is converted to Markdown with ``markdownify``.
    A new browser session is started and torn down on every call.
    """

    name = "extract_web_content_selenium"
    # The tool returns Markdown (see ``forward``), so the description says so
    # rather than promising raw HTML.
    description = (
        "Visit a webpage and extract the full content of the page, "
        "converted from HTML to Markdown."
    )
    inputs = {
        "url": {
            "type": "string",
            "description": "URL of the page to load",
        }
    }
    output_type = "string"

    def forward(self, url: str) -> str:
        """Load ``url`` in headless Chrome and return the page as Markdown.

        Args:
            url: Address of the page to load. Surrounding whitespace is
                ignored.

        Returns:
            The page source converted to Markdown with ATX-style (``#``)
            headings.

        Raises:
            ValueError: If ``url`` is not a non-blank string. Raised before
                any browser is started.

        Errors raised by Selenium while starting Chrome or loading the page
        are not caught here and propagate to the caller.
        """
        # Fail fast: do not resolve ChromeDriver or launch a browser for an
        # argument that cannot be navigated to.
        if not isinstance(url, str) or not url.strip():
            raise ValueError("url must be a non-empty string")
        url = url.strip()

        chrome_options = Options()
        chrome_options.add_argument("--headless")
        # NOTE(review): these two flags are typically needed when Chrome runs
        # inside a container; presumably that is the deployment target.
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        # Present a regular desktop Chrome user agent instead of the default
        # one (presumably to avoid sites rejecting the headless client).
        chrome_options.add_argument(
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/114.0.0.0 Safari/537.36"
        )

        # Automatically installs ChromeDriver.
        service = ChromeService(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
        try:
            driver.get(url)
            # No explicit wait is performed: the source is read as soon as
            # ``get`` returns.
            page_content = driver.page_source
        finally:
            # Always release the browser process, even if navigation failed.
            driver.quit()

        # Convert only after the browser has been shut down, so Chrome is not
        # kept alive for the duration of the HTML-to-Markdown conversion.
        return md(page_content, heading_style="ATX")