Spaces:
Paused
Paused
Commit ·
bcae708
1
Parent(s): f58ccf2
replaced chromedriver/selenium with zenrows
Browse files
- app.py +2 -2
- requirements.txt +3 -2
- web_search.py +32 -29
app.py
CHANGED
|
@@ -110,8 +110,8 @@ if __name__ == '__main__':
|
|
| 110 |
# libnss3=2:3.26.2-1.1+deb9u1 \
|
| 111 |
# libgconf-2-4=3.2.6-4+b1 \
|
| 112 |
# libfontconfig1=2.11.0-6.7+b1
|
| 113 |
-
check_call(['apt-get', 'install', '-y', 'libglib2.0-0 libnss3 libgconf-2-4 libfontconfig1'],
|
| 114 |
-
|
| 115 |
|
| 116 |
logger.info("Launching Gradio ChatInterface for searchbot...")
|
| 117 |
|
|
|
|
| 110 |
# libnss3=2:3.26.2-1.1+deb9u1 \
|
| 111 |
# libgconf-2-4=3.2.6-4+b1 \
|
| 112 |
# libfontconfig1=2.11.0-6.7+b1
|
| 113 |
+
# check_call(['apt-get', 'install', '-y', 'libglib2.0-0 libnss3 libgconf-2-4 libfontconfig1'],
|
| 114 |
+
# stdout=open(os.devnull, 'wb'), stderr=STDOUT)
|
| 115 |
|
| 116 |
logger.info("Launching Gradio ChatInterface for searchbot...")
|
| 117 |
|
requirements.txt
CHANGED
|
@@ -8,8 +8,9 @@ llama-index-embeddings-openai
|
|
| 8 |
llama-index-llms-openai
|
| 9 |
# needed for simpledirectoryreader to work
|
| 10 |
llama-index-readers-file
|
| 11 |
-
selenium==4.22.0
|
| 12 |
unstructured
|
| 13 |
requests
|
| 14 |
-
chromium
|
|
|
|
| 15 |
|
|
|
|
| 8 |
llama-index-llms-openai
|
| 9 |
# needed for simpledirectoryreader to work
|
| 10 |
llama-index-readers-file
|
| 11 |
+
# selenium==4.22.0
|
| 12 |
unstructured
|
| 13 |
requests
|
| 14 |
+
# chromium
|
| 15 |
+
zenrows
|
| 16 |
|
web_search.py
CHANGED
|
@@ -2,7 +2,6 @@ import copy
|
|
| 2 |
import json
|
| 3 |
import logging
|
| 4 |
import os
|
| 5 |
-
import stat
|
| 6 |
import time
|
| 7 |
import traceback
|
| 8 |
import urllib.parse as en
|
|
@@ -10,11 +9,8 @@ import warnings
|
|
| 10 |
from itertools import zip_longest
|
| 11 |
|
| 12 |
import requests
|
| 13 |
-
import selenium.common.exceptions
|
| 14 |
-
from selenium import webdriver
|
| 15 |
-
from selenium.webdriver.chrome.options import Options
|
| 16 |
-
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 17 |
from unstructured.partition.html import partition_html
|
|
|
|
| 18 |
|
| 19 |
from llmsearch import site_stats
|
| 20 |
# this import style works in pycharm
|
|
@@ -27,6 +23,7 @@ from llmsearch import utilityV2 as ut
|
|
| 27 |
# from llmsearch import utilityV2 as ut
|
| 28 |
|
| 29 |
logger = logging.getLogger("agent_logger")
|
|
|
|
| 30 |
|
| 31 |
|
| 32 |
# todo drop blocked pages > see og llmsearch code
|
|
@@ -68,39 +65,45 @@ def process_url(url, timeout):
|
|
| 68 |
try:
|
| 69 |
with warnings.catch_warnings():
|
| 70 |
warnings.simplefilter("ignore")
|
| 71 |
-
options = Options()
|
| 72 |
-
options.page_load_strategy = "eager"
|
| 73 |
-
options.add_argument("--headless")
|
| 74 |
-
options.add_argument("--no-sandbox")
|
| 75 |
-
options.add_argument("--disable-dev-shm-usage")
|
| 76 |
-
|
| 77 |
-
options.add_argument("start-maximized")
|
| 78 |
-
options.add_argument("disable-infobars")
|
| 79 |
-
options.add_argument("--disable-extensions")
|
| 80 |
-
options.add_argument("--disable-gpu")
|
| 81 |
-
options.add_argument("--disable-dev-shm-usage")
|
| 82 |
result = ""
|
| 83 |
# make driver exec
|
| 84 |
-
os.chmod('chromedriver-linux64/chromedriver', stat.S_IEXEC)
|
| 85 |
try:
|
| 86 |
-
driver = webdriver.Chrome(service=ChromeService(executable_path='chromedriver-linux64/chromedriver'),
|
| 87 |
-
|
| 88 |
-
logger.info(f"*****setting page load timeout {timeout}")
|
| 89 |
-
driver.set_page_load_timeout(timeout)
|
| 90 |
-
driver.get(url)
|
| 91 |
-
response = driver.page_source
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
|
|
|
| 96 |
traceback.print_exc()
|
| 97 |
-
logger.info(f"webdriver failed to load")
|
| 98 |
return "", url
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
except Exception:
|
| 100 |
traceback.print_exc()
|
| 101 |
logger.info(f"{site} err")
|
| 102 |
pass
|
| 103 |
-
logger.info(f"Processed {site}: {len(response)} / {len(result)} {int((time.time() - start_time) * 1000)} ms")
|
| 104 |
return result, url
|
| 105 |
|
| 106 |
|
|
|
|
| 2 |
import json
|
| 3 |
import logging
|
| 4 |
import os
|
|
|
|
| 5 |
import time
|
| 6 |
import traceback
|
| 7 |
import urllib.parse as en
|
|
|
|
| 9 |
from itertools import zip_longest
|
| 10 |
|
| 11 |
import requests
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
from unstructured.partition.html import partition_html
|
| 13 |
+
from zenrows import ZenRowsClient
|
| 14 |
|
| 15 |
from llmsearch import site_stats
|
| 16 |
# this import style works in pycharm
|
|
|
|
| 23 |
# from llmsearch import utilityV2 as ut
|
| 24 |
|
| 25 |
logger = logging.getLogger("agent_logger")
|
| 26 |
+
logger = logging.getLogger("agent_logger")
|
| 27 |
|
| 28 |
|
| 29 |
# todo drop blocked pages > see og llmsearch code
|
|
|
|
| 65 |
try:
|
| 66 |
with warnings.catch_warnings():
|
| 67 |
warnings.simplefilter("ignore")
|
| 68 |
+
# options = Options()
|
| 69 |
+
# options.page_load_strategy = "eager"
|
| 70 |
+
# options.add_argument("--headless")
|
| 71 |
+
# options.add_argument("--no-sandbox")
|
| 72 |
+
# options.add_argument("--disable-dev-shm-usage")
|
| 73 |
+
#
|
| 74 |
+
# options.add_argument("start-maximized")
|
| 75 |
+
# options.add_argument("disable-infobars")
|
| 76 |
+
# options.add_argument("--disable-extensions")
|
| 77 |
+
# options.add_argument("--disable-gpu")
|
| 78 |
+
# options.add_argument("--disable-dev-shm-usage")
|
| 79 |
result = ""
|
| 80 |
# make driver exec
|
| 81 |
+
# os.chmod('chromedriver-linux64/chromedriver', stat.S_IEXEC)
|
| 82 |
try:
|
| 83 |
+
# driver = webdriver.Chrome(service=ChromeService(executable_path='chromedriver-linux64/chromedriver'),
|
| 84 |
+
# options=options)
|
| 85 |
+
# logger.info(f"*****setting page load timeout {timeout}")
|
| 86 |
+
# driver.set_page_load_timeout(timeout)
|
| 87 |
+
# driver.get(url)
|
| 88 |
+
# response = driver.page_source
|
| 89 |
+
client = ZenRowsClient(os.getenv('zenrows_api_key'))
|
| 90 |
+
response = client.get(url)
|
| 91 |
+
# result = response_text_extract(url=url, response=response)
|
| 92 |
+
result = response.text
|
| 93 |
+
except Exception:
|
| 94 |
traceback.print_exc()
|
|
|
|
| 95 |
return "", url
|
| 96 |
+
# except selenium.common.exceptions.TimeoutException:
|
| 97 |
+
# return "", url
|
| 98 |
+
# except selenium.common.exceptions.WebDriverException:
|
| 99 |
+
# traceback.print_exc()
|
| 100 |
+
# logger.info(f"webdriver failed to load")
|
| 101 |
+
# return "", url
|
| 102 |
except Exception:
|
| 103 |
traceback.print_exc()
|
| 104 |
logger.info(f"{site} err")
|
| 105 |
pass
|
| 106 |
+
logger.info(f"Processed {site}: {len(response.text)} / {len(result)} {int((time.time() - start_time) * 1000)} ms")
|
| 107 |
return result, url
|
| 108 |
|
| 109 |
|