tx3bas committed on
Commit
12de023
·
verified ·
1 Parent(s): 1e8381d

Update extract.py

Browse files
Files changed (1) hide show
  1. extract.py +43 -11
extract.py CHANGED
@@ -1,6 +1,5 @@
1
  from selenium import webdriver
2
  from selenium.webdriver.common.by import By
3
- from selenium.webdriver.chrome.service import Service
4
  from selenium.common.exceptions import WebDriverException
5
  from bs4 import BeautifulSoup
6
  import time
@@ -9,14 +8,14 @@ import random
9
  # Genera un User Agent aleatorio
10
  def generate_random_user_agent():
11
  browsers = {
12
- "Chrome": range(70, 115), # Versiones 70 a 114
13
- "Firefox": range(60, 110), # Versiones 60 a 109
14
  "Safari": ["13.1", "14.0", "15.0", "16.0", "17.0"],
15
- "Edge": range(80, 105), # Versiones 80 a 104
16
- "Opera": range(50, 90), # Versiones 50 a 89
17
- "Brave": range(1, 40), # Versiones 1 a 39
18
- "Vivaldi": range(2, 6), # Versiones 2 a 5
19
- "UC Browser": [f"13.{v}" for v in range(0, 21)] # Versiones 13.0 a 13.20
20
  }
21
 
22
  operating_systems = [
@@ -36,6 +35,29 @@ def generate_random_user_agent():
36
 
37
  return f"Mozilla/5.0 ({os_version}; {architecture}; {language}) AppleWebKit/537.36 (KHTML, like Gecko) {browser}/{version} Safari/537.36"
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  def extract_data(user_input, mode):
40
  options = webdriver.ChromeOptions()
41
  options.add_argument('--headless')
@@ -45,17 +67,27 @@ def extract_data(user_input, mode):
45
  # Usa un User Agent aleatorio para cada petición
46
  options.add_argument(f"user-agent={generate_random_user_agent()}")
47
 
 
 
 
 
 
 
48
  try:
49
  wd = webdriver.Chrome(options=options)
50
- wd.set_window_size(1080, 720)
 
51
 
52
  # Construir la URL de búsqueda
53
  url_busqueda = f"https://app.neilpatel.com/es/traffic_analyzer/keywords?domain={user_input}&lang=es&locId=2724&mode={mode}"
54
  wd.get(url_busqueda)
55
 
 
 
 
56
  # Espera aleatoria para simular el comportamiento humano
57
- time.sleep(random.uniform(10, 20))
58
-
59
  # Obtener el contenido de la página
60
  page_content = wd.page_source
61
 
 
1
  from selenium import webdriver
2
  from selenium.webdriver.common.by import By
 
3
  from selenium.common.exceptions import WebDriverException
4
  from bs4 import BeautifulSoup
5
  import time
 
8
  # Genera un User Agent aleatorio
9
  def generate_random_user_agent():
10
  browsers = {
11
+ "Chrome": range(70, 115),
12
+ "Firefox": range(60, 110),
13
  "Safari": ["13.1", "14.0", "15.0", "16.0", "17.0"],
14
+ "Edge": range(80, 105),
15
+ "Opera": range(50, 90),
16
+ "Brave": range(1, 40),
17
+ "Vivaldi": range(2, 6),
18
+ "UC Browser": [f"13.{v}" for v in range(0, 21)]
19
  }
20
 
21
  operating_systems = [
 
35
 
36
  return f"Mozilla/5.0 ({os_version}; {architecture}; {language}) AppleWebKit/537.36 (KHTML, like Gecko) {browser}/{version} Safari/537.36"
37
 
38
+ # Generar un tamaño de ventana aleatorio
39
def get_random_window_size():
    """Return a randomly chosen browser window size.

    Returns:
        A ``(width, height)`` tuple of ints picked uniformly from a fixed
        set of predefined sizes.
    """
    # Immutable tuple: the candidate set is constant and never mutated.
    window_sizes = (
        (1920, 1080),
        (1366, 768),
        (1440, 900),
        (1536, 864),
        (1280, 800),
        (1280, 720),
        (1024, 768),
    )
    return random.choice(window_sizes)
44
+
45
+ # Generar cabeceras HTTP aleatorias
46
def generate_random_headers():
    """Build a randomized pair of HTTP header values.

    Returns:
        A dict with two keys, ``'accept-language'`` and ``'accept'``, each
        mapped to a value picked at random from a fixed list of candidates.
    """
    languages = [
        "es-ES,es;q=0.9",
        "en-US,en;q=0.9",
        "fr-FR,fr;q=0.9",
        "de-DE,de;q=0.9",
    ]
    accept = [
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "application/json,text/html;q=0.9",
    ]
    return {
        'accept-language': random.choice(languages),
        'accept': random.choice(accept),
    }
54
+
55
+ # Simular scroll en la página
56
def simulate_scroll(driver, pause_range=(1, 3)):
    """Scroll the page to the bottom, then pause for a random interval.

    Args:
        driver: Object exposing ``execute_script(script)``; it is used to
            run the JavaScript that scrolls the window.
        pause_range: ``(low, high)`` bounds, in seconds, for the random
            pause taken after scrolling. Defaults to ``(1, 3)``.

    Returns:
        None.
    """
    low, high = pause_range
    scroll_pause_time = random.uniform(low, high)
    # Jump to the full height of the document body.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_pause_time)
60
+
61
  def extract_data(user_input, mode):
62
  options = webdriver.ChromeOptions()
63
  options.add_argument('--headless')
 
67
  # Usa un User Agent aleatorio para cada petición
68
  options.add_argument(f"user-agent={generate_random_user_agent()}")
69
 
70
+ # Generar cabeceras HTTP aleatorias
71
+ headers = generate_random_headers()
72
+ options.add_argument(f"accept-language={headers['accept-language']}")
73
+ options.add_argument(f"accept={headers['accept']}")
74
+
75
+ wd = None
76
  try:
77
  wd = webdriver.Chrome(options=options)
78
+ window_size = get_random_window_size()
79
+ wd.set_window_size(window_size[0], window_size[1]) # Ajusta el tamaño de la ventana aleatoriamente
80
 
81
  # Construir la URL de búsqueda
82
  url_busqueda = f"https://app.neilpatel.com/es/traffic_analyzer/keywords?domain={user_input}&lang=es&locId=2724&mode={mode}"
83
  wd.get(url_busqueda)
84
 
85
+ # Simular scroll en la página
86
+ simulate_scroll(wd)
87
+
88
  # Espera aleatoria para simular el comportamiento humano
89
+ time.sleep(random.uniform(3, 5)) # Espera más larga para simular un comportamiento humano
90
+
91
  # Obtener el contenido de la página
92
  page_content = wd.page_source
93