Update app.py
Browse files
app.py
CHANGED
|
@@ -3,7 +3,6 @@ import re
|
|
| 3 |
import gradio as gr
|
| 4 |
import requests
|
| 5 |
import pandas as pd
|
| 6 |
-
from bs4 import BeautifulSoup
|
| 7 |
|
| 8 |
try:
|
| 9 |
from dotenv import load_dotenv
|
|
@@ -42,10 +41,18 @@ def tool_web_page_view(url: str) -> str:
|
|
| 42 |
headers = {"User-Agent": "Mozilla/5.0 (compatible; ReActAgent/1.0)"}
|
| 43 |
r = requests.get(url, timeout=15, headers=headers)
|
| 44 |
r.raise_for_status()
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
return text[:8000] if len(text) > 8000 else text or "No text content found."
|
| 50 |
except Exception as e:
|
| 51 |
return f"Web page view error: {e}"
|
|
|
|
| 3 |
import gradio as gr
|
| 4 |
import requests
|
| 5 |
import pandas as pd
|
|
|
|
| 6 |
|
| 7 |
try:
|
| 8 |
from dotenv import load_dotenv
|
|
|
|
def tool_web_page_view(url: str) -> str:
    """Fetch *url* and return its readable text content, truncated to 8000 chars.

    Prefers BeautifulSoup for real HTML parsing (stripping script/style and
    common boilerplate containers); falls back to a crude regex tag-stripper
    when bs4 is not installed. Any failure — network error, non-2xx status,
    parse error — is returned as an error *string* rather than raised, so the
    calling agent loop never crashes on a bad page.

    Args:
        url: Absolute HTTP(S) URL of the page to fetch.

    Returns:
        Extracted page text (at most 8000 characters), the placeholder
        "No text content found." for empty pages, or an error description
        prefixed with "Web page view error:".
    """
    try:
        # Some sites reject clients without a browser-like User-Agent.
        headers = {"User-Agent": "Mozilla/5.0 (compatible; ReActAgent/1.0)"}
        r = requests.get(url, timeout=15, headers=headers)
        r.raise_for_status()
        html = r.text
        try:
            # Imported lazily so the tool degrades gracefully without bs4.
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(html, "html.parser")
            # Drop elements that rarely carry useful article content.
            for tag in soup(["script", "style", "nav", "footer", "header"]):
                tag.decompose()
            text = soup.get_text(separator="\n", strip=True)
        except ImportError:
            # Fallback: strip scripts/styles, then all remaining tags,
            # then collapse whitespace.
            text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
            text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
            text = re.sub(r"<[^>]+>", " ", text)
            text = re.sub(r"\s+", " ", text).strip()
        # An empty string slices to "" and falls through `or` to the
        # placeholder — equivalent to the old length-conditional, but clearer.
        return text[:8000] or "No text content found."
    except Exception as e:
        # Deliberate best-effort boundary: report, never raise, so the
        # agent can react to the failure text.
        return f"Web page view error: {e}"
|