Update extract.py
Browse files- extract.py +8 -35
extract.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
| 1 |
from selenium import webdriver
|
|
|
|
|
|
|
|
|
|
| 2 |
from selenium.common.exceptions import WebDriverException
|
| 3 |
from bs4 import BeautifulSoup
|
| 4 |
import time
|
| 5 |
import random
|
| 6 |
-
import logging
|
| 7 |
-
from fp.fp import FreeProxy
|
| 8 |
|
| 9 |
# Lista de User Agents para rotar
|
| 10 |
user_agents = [
|
|
@@ -33,38 +34,19 @@ user_agents = [
|
|
| 33 |
def get_random_user_agent():
    """Return one user-agent string chosen uniformly from the rotation pool."""
    return user_agents[random.randrange(len(user_agents))]
|
| 35 |
|
| 36 |
-
def get_random_window_size():
    """Return a random (width, height) pair drawn from common desktop resolutions.

    Used to vary the browser window size between runs so automated sessions
    look less uniform.
    """
    candidates = (
        (1920, 1080),
        (1366, 768),
        (1440, 900),
        (1536, 864),
        (1280, 800),
        (1280, 720),
        (1024, 768),
    )
    return candidates[random.randrange(len(candidates))]
|
| 41 |
-
|
| 42 |
-
def get_proxy():
    """Fetch a random free proxy address (``host:port`` string) via FreeProxy.

    Network I/O: queries the free-proxy provider with a 1-second timeout;
    may return ``None``-like failures upstream if no proxy responds in time.
    """
    return FreeProxy(rand=True, timeout=1).get()
|
| 45 |
-
|
| 46 |
def extract_data(user_input, mode):
|
| 47 |
-
proxy = get_proxy()
|
| 48 |
-
proxy_url = f"http://{proxy}"
|
| 49 |
-
|
| 50 |
options = webdriver.ChromeOptions()
|
| 51 |
options.add_argument('--headless')
|
| 52 |
options.add_argument('--no-sandbox')
|
| 53 |
options.add_argument('--disable-dev-shm-usage')
|
| 54 |
options.add_argument(f"user-agent={get_random_user_agent()}")
|
| 55 |
-
options.add_argument('--proxy-server=%s' % proxy_url)
|
| 56 |
|
| 57 |
-
wd = None
|
| 58 |
try:
|
| 59 |
wd = webdriver.Chrome(options=options)
|
| 60 |
-
|
| 61 |
-
wd.set_window_size(window_size[0], window_size[1])
|
| 62 |
|
| 63 |
# Construir la URL de búsqueda
|
| 64 |
url_busqueda = f"https://app.neilpatel.com/es/traffic_analyzer/keywords?domain={user_input}&lang=es&locId=2724&mode={mode}"
|
| 65 |
-
|
| 66 |
-
logging.info(f"Making request to {url_busqueda} with IP: {proxy_url}")
|
| 67 |
-
|
| 68 |
wd.get(url_busqueda)
|
| 69 |
|
| 70 |
# Espera aleatoria para simular el comportamiento humano
|
|
@@ -73,15 +55,7 @@ def extract_data(user_input, mode):
|
|
| 73 |
# Obtener el contenido de la página
|
| 74 |
page_content = wd.page_source
|
| 75 |
|
| 76 |
-
# Obtener el código de respuesta HTTP
|
| 77 |
-
response_status = wd.execute_script("return document.readyState")
|
| 78 |
-
if response_status == "complete":
|
| 79 |
-
logging.info(f"Request with IP: {proxy_url} returned status code 200")
|
| 80 |
-
else:
|
| 81 |
-
logging.warning(f"Request with IP: {proxy_url} did not return status code 200")
|
| 82 |
-
|
| 83 |
except WebDriverException as e:
|
| 84 |
-
logging.error(f"Request failed with proxy {proxy_url}. Error: {e}")
|
| 85 |
return []
|
| 86 |
finally:
|
| 87 |
if wd:
|
|
@@ -93,19 +67,18 @@ def extract_data(user_input, mode):
|
|
| 93 |
# Buscar el div con id="root"
|
| 94 |
root_div = soup.find('div', id='root')
|
| 95 |
if not root_div:
|
| 96 |
-
logging.error("No se encontró el div con id 'root'")
|
| 97 |
return []
|
| 98 |
|
|
|
|
|
|
|
|
|
|
| 99 |
# Extraer el texto plano dentro del div
|
| 100 |
texto_plano = root_div.get_text(separator='\n', strip=True)
|
| 101 |
|
| 102 |
-
# Log el contenido del div root
|
| 103 |
-
logging.info(f"Contenido del div 'root':\n{texto_plano}")
|
| 104 |
-
|
| 105 |
# Buscar la palabra clave específica "Última actualización" y descartar todo lo anterior
|
| 106 |
keyword = "Última actualización"
|
| 107 |
index = texto_plano.find(keyword)
|
| 108 |
-
if index != -1:
|
| 109 |
texto_plano = texto_plano[index + len(keyword):].strip()
|
| 110 |
|
| 111 |
# Eliminar todas las líneas que contienen la palabra "Búsquedas"
|
|
|
|
| 1 |
from selenium import webdriver
|
| 2 |
+
from selenium.webdriver.common.by import By
|
| 3 |
+
from selenium.webdriver.common.keys import Keys
|
| 4 |
+
from selenium.webdriver.chrome.service import Service
|
| 5 |
from selenium.common.exceptions import WebDriverException
|
| 6 |
from bs4 import BeautifulSoup
|
| 7 |
import time
|
| 8 |
import random
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# Lista de User Agents para rotar
|
| 11 |
user_agents = [
|
|
|
|
| 34 |
def get_random_user_agent():
    """Pick a random entry from the module-level ``user_agents`` pool."""
    pool = user_agents
    return random.choice(pool)
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
def extract_data(user_input, mode):
|
|
|
|
|
|
|
|
|
|
| 38 |
options = webdriver.ChromeOptions()
|
| 39 |
options.add_argument('--headless')
|
| 40 |
options.add_argument('--no-sandbox')
|
| 41 |
options.add_argument('--disable-dev-shm-usage')
|
| 42 |
options.add_argument(f"user-agent={get_random_user_agent()}")
|
|
|
|
| 43 |
|
|
|
|
| 44 |
try:
|
| 45 |
wd = webdriver.Chrome(options=options)
|
| 46 |
+
wd.set_window_size(1080, 720)
|
|
|
|
| 47 |
|
| 48 |
# Construir la URL de búsqueda
|
| 49 |
url_busqueda = f"https://app.neilpatel.com/es/traffic_analyzer/keywords?domain={user_input}&lang=es&locId=2724&mode={mode}"
|
|
|
|
|
|
|
|
|
|
| 50 |
wd.get(url_busqueda)
|
| 51 |
|
| 52 |
# Espera aleatoria para simular el comportamiento humano
|
|
|
|
| 55 |
# Obtener el contenido de la p谩gina
|
| 56 |
page_content = wd.page_source
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
except WebDriverException as e:
|
|
|
|
| 59 |
return []
|
| 60 |
finally:
|
| 61 |
if wd:
|
|
|
|
| 67 |
# Buscar el div con id="root"
|
| 68 |
root_div = soup.find('div', id='root')
|
| 69 |
if not root_div:
|
|
|
|
| 70 |
return []
|
| 71 |
|
| 72 |
+
# Imprimir el contenido del div con id="root"
|
| 73 |
+
print(root_div.prettify())
|
| 74 |
+
|
| 75 |
# Extraer el texto plano dentro del div
|
| 76 |
texto_plano = root_div.get_text(separator='\n', strip=True)
|
| 77 |
|
|
|
|
|
|
|
|
|
|
| 78 |
# Buscar la palabra clave específica "Última actualización" y descartar todo lo anterior
|
| 79 |
keyword = "Última actualización"
|
| 80 |
index = texto_plano.find(keyword)
|
| 81 |
+
if (index != -1):
|
| 82 |
texto_plano = texto_plano[index + len(keyword):].strip()
|
| 83 |
|
| 84 |
# Eliminar todas las líneas que contienen la palabra "Búsquedas"
|