|
|
from selenium import webdriver |
|
|
from selenium.webdriver.common.by import By |
|
|
from selenium.common.exceptions import WebDriverException |
|
|
from bs4 import BeautifulSoup |
|
|
import time |
|
|
import random |
|
|
|
|
|
|
|
|
def generate_random_user_agent():
    """Assemble a randomized User-Agent string for request fingerprint variety.

    Picks a browser/version pair, a platform string, a CPU architecture and a
    locale independently at random and interpolates them into a Chrome-shaped
    UA template. The result is not a spec-accurate UA for every browser listed;
    it only needs to look plausible and vary between runs.
    """
    version_pools = {
        "Chrome": range(70, 115),
        "Firefox": range(60, 110),
        "Safari": ["13.1", "14.0", "15.0", "16.0", "17.0"],
        "Edge": range(80, 105),
        "Opera": range(50, 90),
        "Brave": range(1, 40),
        "Vivaldi": range(2, 6),
        "UC Browser": [f"13.{v}" for v in range(0, 21)],
    }

    platform_strings = [
        "Windows NT 10.0; Win64; x64", "Windows NT 11.0; Win64; x64",
        "Macintosh; Intel Mac OS X 10_15_7", "X11; Linux x86_64",
        "Linux; Android 12; Pixel 5", "iPhone; CPU iPhone OS 16_2 like Mac OS X",
    ]

    cpu_archs = ["x86", "x86_64", "ARM", "ARM64"]
    locales = ["en-US", "en-GB", "es-ES", "fr-FR", "de-DE", "it-IT", "pt-BR", "ru-RU", "zh-CN", "ja-JP", "ko-KR"]

    # Each component is drawn independently; random.choice works on both
    # range objects and lists, so the mixed pool types are fine.
    browser_name, pool = random.choice(list(version_pools.items()))
    build = random.choice(pool)
    platform = random.choice(platform_strings)
    arch = random.choice(cpu_archs)
    locale = random.choice(locales)

    prefix = f"Mozilla/5.0 ({platform}; {arch}; {locale})"
    engine = "AppleWebKit/537.36 (KHTML, like Gecko)"
    return f"{prefix} {engine} {browser_name}/{build} Safari/537.36"
|
|
|
|
|
|
|
|
def get_random_window_size():
    """Return a random ``(width, height)`` pair from common desktop resolutions."""
    return random.choice([
        (1920, 1080), (1366, 768), (1440, 900), (1536, 864),
        (1280, 800), (1280, 720), (1024, 768),
    ])
|
|
|
|
|
|
|
|
def generate_random_headers():
    """Build a dict with randomized ``accept-language`` and ``accept`` values."""
    language_pool = ["es-ES,es;q=0.9", "en-US,en;q=0.9", "fr-FR,fr;q=0.9", "de-DE,de;q=0.9"]
    accept_pool = ["text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                   "application/json,text/html;q=0.9"]

    headers = {}
    headers['accept-language'] = random.choice(language_pool)
    headers['accept'] = random.choice(accept_pool)
    return headers
|
|
|
|
|
|
|
|
def simulate_scroll(driver):
    """Scroll the page to the bottom, then pause 1-3 s to mimic a human reader.

    ``driver`` is a selenium WebDriver with a loaded page.
    """
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(random.uniform(1, 3))
|
|
|
|
|
def extract_data(user_input, mode):
    """Scrape keyword rows for ``user_input`` from Neil Patel's traffic analyzer.

    Launches a headless Chrome with a randomized user agent and window size,
    loads the keywords page for the given domain/mode, and parses the rendered
    text into a list of 7-field dicts (keyword, URL, volume, position, visits,
    SD, last update).

    Parameters:
        user_input: domain to analyze (interpolated into the query URL).
        mode: analyzer mode string (interpolated into the query URL).

    Returns:
        List of dicts with Spanish column-name keys, or ``[]`` when the
        browser fails to start/navigate or the expected page root is missing.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    options.add_argument(f"user-agent={generate_random_user_agent()}")

    # NOTE(review): "accept-language"/"accept" are not recognized Chrome CLI
    # switches, so these lines do not actually change request headers; real
    # header control would need CDP or a proxy. Kept for argv parity.
    headers = generate_random_headers()
    options.add_argument(f"accept-language={headers['accept-language']}")
    options.add_argument(f"accept={headers['accept']}")

    wd = None
    try:
        wd = webdriver.Chrome(options=options)
        width, height = get_random_window_size()
        wd.set_window_size(width, height)

        url_busqueda = f"https://app.neilpatel.com/es/traffic_analyzer/keywords?domain={user_input}&lang=es&locId=2724&mode={mode}"
        wd.get(url_busqueda)

        simulate_scroll(wd)

        # Give the SPA time to render its keyword table.
        time.sleep(random.uniform(4, 6))

        page_content = wd.page_source

        wd.delete_all_cookies()
    except WebDriverException:
        # Best-effort scraping: any driver/browser failure yields no rows.
        return []
    finally:
        if wd:
            wd.quit()

    soup = BeautifulSoup(page_content, 'html.parser')

    root_div = soup.find('div', id='root')
    if not root_div:
        return []

    texto_plano = root_div.get_text(separator='\n', strip=True)

    # Drop everything up to and including the first column-header occurrence
    # of "Última actualización" so only table rows remain.
    keyword = "Última actualización"
    index = texto_plano.find(keyword)
    if index != -1:
        texto_plano = texto_plano[index + len(keyword):].strip()

    lineas = texto_plano.split('\n')
    lineas_filtradas = [linea for linea in lineas if "Búsquedas" not in linea]

    # Truncate at the upgrade banner: rows after it belong to the paywall UI.
    for i, linea in enumerate(lineas_filtradas):
        if "ACTUALIZA A PRO" in linea:
            lineas_filtradas = lineas_filtradas[:i]
            break

    def parsear_texto(lineas):
        # Each table row flattens to exactly 7 consecutive text lines;
        # a trailing partial group (fewer than 7 lines) is discarded.
        datos_parseados = []
        for i in range(0, len(lineas) - 6, 7):
            datos_parseados.append({
                "Palabras clave": lineas[i],
                "URL": lineas[i + 1],
                "Volumen": lineas[i + 2],
                "Posición": lineas[i + 3],
                "Visitas": lineas[i + 4],
                "SD": lineas[i + 5],
                "Última actualización": lineas[i + 6],
            })
        return datos_parseados

    return parsear_texto(lineas_filtradas)
|
|
|