Update extract.py
Browse files- extract.py +16 -1
extract.py
CHANGED
|
@@ -1,8 +1,13 @@
|
|
| 1 |
from selenium import webdriver
|
|
|
|
|
|
|
|
|
|
| 2 |
from selenium.common.exceptions import WebDriverException
|
|
|
|
| 3 |
import time
|
| 4 |
import random
|
| 5 |
-
|
|
|
|
| 6 |
|
| 7 |
# List of User Agents to rotate through
|
| 8 |
user_agents = [
|
|
@@ -28,6 +33,7 @@ user_agents = [
|
|
| 28 |
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:91.0) Gecko/20100101 Firefox/91.0",
|
| 29 |
]
|
| 30 |
|
|
|
|
| 31 |
def get_random_user_agent():
    """Pick one entry at random from the module-level ``user_agents`` list."""
    chosen_agent = random.choice(user_agents)
    return chosen_agent
|
| 33 |
|
|
@@ -37,12 +43,20 @@ def get_random_window_size():
|
|
| 37 |
]
|
| 38 |
return random.choice(window_sizes)
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
def extract_data(user_input, mode):
|
|
|
|
|
|
|
|
|
|
| 41 |
options = webdriver.ChromeOptions()
|
| 42 |
options.add_argument('--headless')
|
| 43 |
options.add_argument('--no-sandbox')
|
| 44 |
options.add_argument('--disable-dev-shm-usage')
|
| 45 |
options.add_argument(f"user-agent={get_random_user_agent()}")
|
|
|
|
| 46 |
|
| 47 |
try:
|
| 48 |
wd = webdriver.Chrome(options=options)
|
|
@@ -60,6 +74,7 @@ def extract_data(user_input, mode):
|
|
| 60 |
page_content = wd.page_source
|
| 61 |
|
| 62 |
except WebDriverException as e:
|
|
|
|
| 63 |
return []
|
| 64 |
finally:
|
| 65 |
if wd:
|
|
|
|
| 1 |
from selenium import webdriver
|
| 2 |
+
from selenium.webdriver.common.by import By
|
| 3 |
+
from selenium.webdriver.common.keys import Keys
|
| 4 |
+
from selenium.webdriver.chrome.service import Service
|
| 5 |
from selenium.common.exceptions import WebDriverException
|
| 6 |
+
from bs4 import BeautifulSoup
|
| 7 |
import time
|
| 8 |
import random
|
| 9 |
+
import requests
|
| 10 |
+
from fp.fp import FreeProxy
|
| 11 |
|
| 12 |
# List of User Agents to rotate through
|
| 13 |
user_agents = [
|
|
|
|
| 33 |
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:91.0) Gecko/20100101 Firefox/91.0",
|
| 34 |
]
|
| 35 |
|
| 36 |
+
|
| 37 |
def get_random_user_agent():
    """Pick one entry at random from the module-level ``user_agents`` list."""
    chosen_agent = random.choice(user_agents)
    return chosen_agent
|
| 39 |
|
|
|
|
| 43 |
]
|
| 44 |
return random.choice(window_sizes)
|
| 45 |
|
| 46 |
+
def get_proxy():
    """Ask FreeProxy for a proxy and hand back its result unchanged.

    The lookup is made with ``rand=True`` and ``timeout=1``; whatever
    ``FreeProxy.get()`` returns is passed straight to the caller.

    NOTE(review): ``FreeProxy.get()`` appears to return a full URL that
    already includes the scheme, while ``extract_data`` prepends ``http://``
    to this value -- confirm the resulting proxy string is well-formed.
    """
    return FreeProxy(rand=True, timeout=1).get()
|
| 49 |
+
|
| 50 |
def extract_data(user_input, mode):
|
| 51 |
+
proxy = get_proxy()
|
| 52 |
+
proxy_url = f"http://{proxy}"
|
| 53 |
+
|
| 54 |
options = webdriver.ChromeOptions()
|
| 55 |
options.add_argument('--headless')
|
| 56 |
options.add_argument('--no-sandbox')
|
| 57 |
options.add_argument('--disable-dev-shm-usage')
|
| 58 |
options.add_argument(f"user-agent={get_random_user_agent()}")
|
| 59 |
+
options.add_argument('--proxy-server=%s' % proxy_url)
|
| 60 |
|
| 61 |
try:
|
| 62 |
wd = webdriver.Chrome(options=options)
|
|
|
|
| 74 |
page_content = wd.page_source
|
| 75 |
|
| 76 |
except WebDriverException as e:
|
| 77 |
+
print(f"Request failed with proxy {proxy_url}. Error: {e}")
|
| 78 |
return []
|
| 79 |
finally:
|
| 80 |
if wd:
|