Update extract.py
Browse files- extract.py +43 -11
extract.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
from selenium import webdriver
|
| 2 |
from selenium.webdriver.common.by import By
|
| 3 |
-
from selenium.webdriver.chrome.service import Service
|
| 4 |
from selenium.common.exceptions import WebDriverException
|
| 5 |
from bs4 import BeautifulSoup
|
| 6 |
import time
|
|
@@ -9,14 +8,14 @@ import random
|
|
| 9 |
# Genera un User Agent aleatorio
|
| 10 |
def generate_random_user_agent():
|
| 11 |
browsers = {
|
| 12 |
-
"Chrome": range(70, 115),
|
| 13 |
-
"Firefox": range(60, 110),
|
| 14 |
"Safari": ["13.1", "14.0", "15.0", "16.0", "17.0"],
|
| 15 |
-
"Edge": range(80, 105),
|
| 16 |
-
"Opera": range(50, 90),
|
| 17 |
-
"Brave": range(1, 40),
|
| 18 |
-
"Vivaldi": range(2, 6),
|
| 19 |
-
"UC Browser": [f"13.{v}" for v in range(0, 21)]
|
| 20 |
}
|
| 21 |
|
| 22 |
operating_systems = [
|
|
@@ -36,6 +35,29 @@ def generate_random_user_agent():
|
|
| 36 |
|
| 37 |
return f"Mozilla/5.0 ({os_version}; {architecture}; {language}) AppleWebKit/537.36 (KHTML, like Gecko) {browser}/{version} Safari/537.36"
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
def extract_data(user_input, mode):
|
| 40 |
options = webdriver.ChromeOptions()
|
| 41 |
options.add_argument('--headless')
|
|
@@ -45,17 +67,27 @@ def extract_data(user_input, mode):
|
|
| 45 |
# Usa un User Agent aleatorio para cada petición
|
| 46 |
options.add_argument(f"user-agent={generate_random_user_agent()}")
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
try:
|
| 49 |
wd = webdriver.Chrome(options=options)
|
| 50 |
-
|
|
|
|
| 51 |
|
| 52 |
# Construir la URL de búsqueda
|
| 53 |
url_busqueda = f"https://app.neilpatel.com/es/traffic_analyzer/keywords?domain={user_input}&lang=es&locId=2724&mode={mode}"
|
| 54 |
wd.get(url_busqueda)
|
| 55 |
|
|
|
|
|
|
|
|
|
|
| 56 |
# Espera aleatoria para simular el comportamiento humano
|
| 57 |
-
time.sleep(random.uniform(
|
| 58 |
-
|
| 59 |
# Obtener el contenido de la página
|
| 60 |
page_content = wd.page_source
|
| 61 |
|
|
|
|
| 1 |
from selenium import webdriver
|
| 2 |
from selenium.webdriver.common.by import By
|
|
|
|
| 3 |
from selenium.common.exceptions import WebDriverException
|
| 4 |
from bs4 import BeautifulSoup
|
| 5 |
import time
|
|
|
|
| 8 |
# Genera un User Agent aleatorio
|
| 9 |
def generate_random_user_agent():
|
| 10 |
browsers = {
|
| 11 |
+
"Chrome": range(70, 115),
|
| 12 |
+
"Firefox": range(60, 110),
|
| 13 |
"Safari": ["13.1", "14.0", "15.0", "16.0", "17.0"],
|
| 14 |
+
"Edge": range(80, 105),
|
| 15 |
+
"Opera": range(50, 90),
|
| 16 |
+
"Brave": range(1, 40),
|
| 17 |
+
"Vivaldi": range(2, 6),
|
| 18 |
+
"UC Browser": [f"13.{v}" for v in range(0, 21)]
|
| 19 |
}
|
| 20 |
|
| 21 |
operating_systems = [
|
|
|
|
| 35 |
|
| 36 |
return f"Mozilla/5.0 ({os_version}; {architecture}; {language}) AppleWebKit/537.36 (KHTML, like Gecko) {browser}/{version} Safari/537.36"
|
| 37 |
|
| 38 |
+
# Generate a random window size
def get_random_window_size():
    """Pick one window size at random from a fixed list of candidates.

    Returns:
        tuple[int, int]: The chosen size as a ``(width, height)`` pair.
    """
    window_sizes = [
        (1920, 1080), (1366, 768), (1440, 900), (1536, 864),
        (1280, 800), (1280, 720), (1024, 768),
    ]
    return random.choice(window_sizes)
|
| 44 |
+
|
| 45 |
+
# Generate random HTTP headers
def generate_random_headers():
    """Build a dict of randomly chosen HTTP header values.

    Returns:
        dict[str, str]: A mapping with exactly two keys, ``'accept-language'``
        and ``'accept'``, each set to one value picked at random from the
        corresponding fixed list of candidates below.
    """
    languages = [
        "es-ES,es;q=0.9",
        "en-US,en;q=0.9",
        "fr-FR,fr;q=0.9",
        "de-DE,de;q=0.9",
    ]
    accept = [
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "application/json,text/html;q=0.9",
    ]
    return {
        'accept-language': random.choice(languages),
        'accept': random.choice(accept),
    }
|
| 54 |
+
|
| 55 |
+
# Simulate scrolling the page
def simulate_scroll(driver):
    """Scroll the page to the bottom once, then pause for a random interval.

    Args:
        driver: Object exposing ``execute_script(js)``; presumably a Selenium
            WebDriver instance (verify against the caller).

    Returns:
        None. The function only runs a script through the driver and sleeps.
    """
    # The pause length is drawn before scrolling: a uniform random 1-3 seconds.
    scroll_pause_time = random.uniform(1, 3)
    # Single jump to the bottom of the document (no incremental scrolling).
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_pause_time)
|
| 60 |
+
|
| 61 |
def extract_data(user_input, mode):
|
| 62 |
options = webdriver.ChromeOptions()
|
| 63 |
options.add_argument('--headless')
|
|
|
|
| 67 |
# Usa un User Agent aleatorio para cada petición
|
| 68 |
options.add_argument(f"user-agent={generate_random_user_agent()}")
|
| 69 |
|
| 70 |
+
# Generar cabeceras HTTP aleatorias
|
| 71 |
+
headers = generate_random_headers()
|
| 72 |
+
options.add_argument(f"accept-language={headers['accept-language']}")
|
| 73 |
+
options.add_argument(f"accept={headers['accept']}")
|
| 74 |
+
|
| 75 |
+
wd = None
|
| 76 |
try:
|
| 77 |
wd = webdriver.Chrome(options=options)
|
| 78 |
+
window_size = get_random_window_size()
|
| 79 |
+
wd.set_window_size(window_size[0], window_size[1]) # Ajusta el tamaño de la ventana aleatoriamente
|
| 80 |
|
| 81 |
# Construir la URL de búsqueda
|
| 82 |
url_busqueda = f"https://app.neilpatel.com/es/traffic_analyzer/keywords?domain={user_input}&lang=es&locId=2724&mode={mode}"
|
| 83 |
wd.get(url_busqueda)
|
| 84 |
|
| 85 |
+
# Simular scroll en la página
|
| 86 |
+
simulate_scroll(wd)
|
| 87 |
+
|
| 88 |
# Espera aleatoria para simular el comportamiento humano
|
| 89 |
+
time.sleep(random.uniform(3, 5)) # Espera más larga para simular un comportamiento humano
|
| 90 |
+
|
| 91 |
# Obtener el contenido de la página
|
| 92 |
page_content = wd.page_source
|
| 93 |
|