Update extract.py
Browse files- extract.py +22 -1
extract.py
CHANGED
|
@@ -1,21 +1,42 @@
|
|
| 1 |
from selenium import webdriver
|
|
|
|
|
|
|
|
|
|
| 2 |
from selenium.common.exceptions import WebDriverException
|
| 3 |
from bs4 import BeautifulSoup
|
| 4 |
import time
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
def extract_data(user_input, mode):
|
| 7 |
options = webdriver.ChromeOptions()
|
| 8 |
options.add_argument('--headless')
|
| 9 |
options.add_argument('--no-sandbox')
|
| 10 |
options.add_argument('--disable-dev-shm-usage')
|
|
|
|
| 11 |
|
| 12 |
try:
|
| 13 |
wd = webdriver.Chrome(options=options)
|
| 14 |
wd.set_window_size(1080, 720)
|
|
|
|
| 15 |
# Construir la URL de búsqueda
|
| 16 |
url_busqueda = f"https://app.neilpatel.com/es/traffic_analyzer/keywords?domain={user_input}&lang=es&locId=2724&mode={mode}"
|
| 17 |
wd.get(url_busqueda)
|
| 18 |
-
|
|
|
|
|
|
|
| 19 |
|
| 20 |
# Obtener el contenido de la página
|
| 21 |
page_content = wd.page_source
|
|
|
|
| 1 |
from selenium import webdriver
|
| 2 |
+
from selenium.webdriver.common.by import By
|
| 3 |
+
from selenium.webdriver.common.keys import Keys
|
| 4 |
+
from selenium.webdriver.chrome.service import Service
|
| 5 |
from selenium.common.exceptions import WebDriverException
|
| 6 |
from bs4 import BeautifulSoup
|
| 7 |
import time
|
| 8 |
+
import random
|
| 9 |
+
|
| 10 |
+
# Pool of browser User-Agent strings to rotate between requests so the
# headless browser looks less like an automated client.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
]


def get_random_user_agent():
    """Return one User-Agent string chosen uniformly at random from the pool."""
    return random.choice(user_agents)
|
| 22 |
|
| 23 |
def extract_data(user_input, mode):
|
| 24 |
options = webdriver.ChromeOptions()
|
| 25 |
options.add_argument('--headless')
|
| 26 |
options.add_argument('--no-sandbox')
|
| 27 |
options.add_argument('--disable-dev-shm-usage')
|
| 28 |
+
options.add_argument(f"user-agent={get_random_user_agent()}")
|
| 29 |
|
| 30 |
try:
|
| 31 |
wd = webdriver.Chrome(options=options)
|
| 32 |
wd.set_window_size(1080, 720)
|
| 33 |
+
|
| 34 |
# Construir la URL de búsqueda
|
| 35 |
url_busqueda = f"https://app.neilpatel.com/es/traffic_analyzer/keywords?domain={user_input}&lang=es&locId=2724&mode={mode}"
|
| 36 |
wd.get(url_busqueda)
|
| 37 |
+
|
| 38 |
+
# Espera aleatoria para simular el comportamiento humano
|
| 39 |
+
time.sleep(random.uniform(10, 20))
|
| 40 |
|
| 41 |
# Obtener el contenido de la página
|
| 42 |
page_content = wd.page_source
|