Spaces:
Sleeping
Sleeping
Update application/utils/web_search.py
Browse files
application/utils/web_search.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
| 1 |
-
|
|
|
|
| 2 |
from bs4 import BeautifulSoup
|
|
|
|
| 3 |
import re
|
| 4 |
-
from duckduckgo_search import DDGS
|
| 5 |
|
| 6 |
class WebScarper:
|
| 7 |
def __init__(self):
|
|
@@ -19,16 +20,15 @@ class WebScarper:
|
|
| 19 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
|
| 20 |
}
|
| 21 |
response = requests.get(url, headers=headers, timeout=10)
|
| 22 |
-
response.raise_for_status()
|
| 23 |
return response.text
|
| 24 |
except requests.exceptions.RequestException as e:
|
| 25 |
print(f"Error fetching URL {url}: {e}")
|
| 26 |
return None
|
| 27 |
-
|
| 28 |
def get_text(self, data):
|
| 29 |
soup = BeautifulSoup(data, 'html.parser')
|
| 30 |
text = soup.get_text()
|
| 31 |
-
cleaned_text = re.sub(r'\s+', ' ', text).strip()
|
| 32 |
return cleaned_text[:4000] if len(cleaned_text) > 4000 else cleaned_text
|
| 33 |
|
| 34 |
def scarpe(self, query):
|
|
|
|
| 1 |
+
# application/utils/web_search.py
|
| 2 |
+
from duckduckgo_search import DDGS # Simpler import
|
| 3 |
from bs4 import BeautifulSoup
|
| 4 |
+
import requests
|
| 5 |
import re
|
|
|
|
| 6 |
|
| 7 |
class WebScarper:
|
| 8 |
def __init__(self):
|
|
|
|
| 20 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
|
| 21 |
}
|
| 22 |
response = requests.get(url, headers=headers, timeout=10)
|
| 23 |
+
response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
|
| 24 |
return response.text
|
| 25 |
except requests.exceptions.RequestException as e:
|
| 26 |
print(f"Error fetching URL {url}: {e}")
|
| 27 |
return None
|
|
|
|
| 28 |
def get_text(self, data):
|
| 29 |
soup = BeautifulSoup(data, 'html.parser')
|
| 30 |
text = soup.get_text()
|
| 31 |
+
cleaned_text = re.sub(r'\s+', ' ', text).strip() # Remove extra whitespace
|
| 32 |
return cleaned_text[:4000] if len(cleaned_text) > 4000 else cleaned_text
|
| 33 |
|
| 34 |
def scarpe(self, query):
|