MainStreet123 committed on
Commit
9ef3450
·
verified ·
1 Parent(s): 48cfa06

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -5
app.py CHANGED
@@ -3,7 +3,6 @@ import re
3
  import gradio as gr
4
  import requests
5
  import pandas as pd
6
- from bs4 import BeautifulSoup
7
 
8
  try:
9
  from dotenv import load_dotenv
@@ -42,10 +41,18 @@ def tool_web_page_view(url: str) -> str:
42
  headers = {"User-Agent": "Mozilla/5.0 (compatible; ReActAgent/1.0)"}
43
  r = requests.get(url, timeout=15, headers=headers)
44
  r.raise_for_status()
45
- soup = BeautifulSoup(r.text, "html.parser")
46
- for tag in soup(["script", "style", "nav", "footer", "header"]):
47
- tag.decompose()
48
- text = soup.get_text(separator="\n", strip=True)
 
 
 
 
 
 
 
 
49
  return text[:8000] if len(text) > 8000 else text or "No text content found."
50
  except Exception as e:
51
  return f"Web page view error: {e}"
 
3
  import gradio as gr
4
  import requests
5
  import pandas as pd
 
6
 
7
  try:
8
  from dotenv import load_dotenv
 
41
  headers = {"User-Agent": "Mozilla/5.0 (compatible; ReActAgent/1.0)"}
42
  r = requests.get(url, timeout=15, headers=headers)
43
  r.raise_for_status()
44
+ html = r.text
45
+ try:
46
+ from bs4 import BeautifulSoup
47
+ soup = BeautifulSoup(html, "html.parser")
48
+ for tag in soup(["script", "style", "nav", "footer", "header"]):
49
+ tag.decompose()
50
+ text = soup.get_text(separator="\n", strip=True)
51
+ except ImportError:
52
+ text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
53
+ text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
54
+ text = re.sub(r"<[^>]+>", " ", text)
55
+ text = re.sub(r"\s+", " ", text).strip()
56
  return text[:8000] if len(text) > 8000 else text or "No text content found."
57
  except Exception as e:
58
  return f"Web page view error: {e}"