tx3bas committed on
Commit
8b37454
·
verified ·
1 Parent(s): ce5ecf0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -194
app.py CHANGED
@@ -3,220 +3,95 @@ import requests
3
  import urllib.parse
4
  import re
5
  import xmltodict
6
- from itertools import product
7
  import nltk
8
  from nltk.corpus import stopwords
9
  from nltk.stem import SnowballStemmer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  # Descargar recursos de NLTK
12
- nltk.download('stopwords', quiet=True)
13
- nltk.download('punkt', quiet=True)
 
 
 
 
14
 
15
  # Inicializar el stemmer y la lista de stopwords
16
  stemmer = SnowballStemmer("spanish")
17
  stop_words = set(stopwords.words('spanish'))
 
18
 
19
- # Función para obtener sugerencias de DuckDuckGo
20
def fetch_duckduckgo_suggestions(query, lang_code="es"):
    """Return DuckDuckGo autocomplete phrases for ``query``.

    Args:
        query: Search text to complete; it is URL-encoded before sending.
        lang_code: Value sent as the ``kl`` query parameter.

    Returns:
        A list with the ``phrase`` field of every item in the JSON response,
        or an empty list when the request fails, the status is not 200, or
        the payload does not have the expected shape.
    """
    encoded_query = urllib.parse.quote(query)
    url = f"https://duckduckgo.com/ac/?q={encoded_query}&kl={lang_code}"
    try:
        # Without a timeout a stalled endpoint would block the caller forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"Error de red consultando DuckDuckGo: {exc}")
        return []

    if response.status_code != 200:
        return []

    try:
        data = response.json()
        return [item['phrase'] for item in data]
    except (ValueError, KeyError, TypeError):
        # ValueError: body is not JSON; KeyError/TypeError: unexpected shape.
        print("Error decodificando JSON de DuckDuckGo")
        return []
33
-
34
- # Función para obtener sugerencias de Google
35
def fetch_google_suggestions(query, lang_code="es"):
    """Return Google autocomplete suggestions for ``query``.

    Args:
        query: Search text to complete; it is URL-encoded before sending.
        lang_code: Value sent as the ``hl`` query parameter.

    Returns:
        The second element of the JSON array returned by the endpoint (the
        suggestion list), or an empty list when the request fails, the status
        is not 200, or the payload does not have the expected shape.
    """
    encoded_query = urllib.parse.quote(query)
    url = f"http://suggestqueries.google.com/complete/search?client=firefox&hl={lang_code}&q={encoded_query}"
    try:
        # Without a timeout a stalled endpoint would block the caller forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"Error de red consultando Google: {exc}")
        return []

    if response.status_code != 200:
        return []

    try:
        return response.json()[1]
    except (ValueError, IndexError, KeyError, TypeError):
        # ValueError: body is not JSON; the others: payload is not a
        # sequence with at least two elements.
        print("Error decodificando JSON de Google")
        return []
47
-
48
- # Función para obtener sugerencias de YouTube
49
def fetch_youtube_suggestions(query, lang_code="es"):
    """Return YouTube autocomplete suggestions for ``query``.

    The endpoint answers with a JavaScript call wrapping the data
    (``window.google.ac.h([...])``). The argument is parsed as JSON first;
    if that fails, the regex extraction is used as a fallback.

    Args:
        query: Search text to complete; it is URL-encoded before sending.
        lang_code: Value sent as the ``hl`` query parameter.

    Returns:
        A list of suggestion strings, or an empty list when the request
        fails, the status is not 200, or no suggestions can be extracted.
    """
    import json

    encoded_query = urllib.parse.quote(query)
    url = f"http://suggestqueries.google.com/complete/search?client=youtube&hl={lang_code}&q={encoded_query}"
    try:
        # Without a timeout a stalled endpoint would block the caller forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"Error de red consultando YouTube: {exc}")
        return []

    if response.status_code != 200:
        return []

    text = response.text

    # Preferred path: decode the argument of the wrapping call as JSON.
    # NOTE(review): assumes the payload is [query, [[suggestion, ...], ...], ...]
    # -- confirm against a live response.
    start, end = text.find('('), text.rfind(')')
    if 0 <= start < end:
        try:
            payload = json.loads(text[start + 1:end])
            return [
                entry[0]
                for entry in payload[1]
                if isinstance(entry, list) and entry and isinstance(entry[0], str)
            ]
        except (ValueError, IndexError, KeyError, TypeError):
            # Deliberate: fall through to the regex extraction below.
            pass

    try:
        # The lazy group stops at the first "]," so this may capture only a
        # prefix of the suggestions; it is kept purely as a fallback.
        match = re.search(r'window\.google\.ac\.h\(\["[^"]*",\[(.*?)\],', text)
        if not match:
            print("No se encontraron sugerencias en el formato esperado.")
            return []
        return re.findall(r'\["([^"]+)"', match.group(1))
    except Exception as e:
        print(f"Error procesando la respuesta de YouTube: {e}")
        return []
69
-
70
- # Función para obtener sugerencias de Bing
71
def fetch_bing_suggestions(query, market="es-ES"):
    """Return Bing autocomplete suggestions for ``query``.

    Args:
        query: Search text to complete, sent as the ``query`` parameter.
        market: Value sent as the ``Market`` parameter.

    Returns:
        A list with the ``Text`` of every item under
        ``SearchSuggestion/Section/Item`` in the XML response, or an empty
        list when the request fails, the status is not 200, or the document
        cannot be processed.
    """
    url = "https://api.bing.com/qsml.aspx"
    params = {
        "Market": market,
        "query": query,
    }
    headers = {
        "User-agent": "Mozilla/5.0",
    }
    try:
        # Without a timeout a stalled endpoint would block the caller forever.
        response = requests.get(url, params=params, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"Error de red consultando Bing: {exc}")
        return []

    if response.status_code != 200:
        return []

    try:
        obj = xmltodict.parse(response.content)
        items = []
        if 'SearchSuggestion' in obj and obj['SearchSuggestion']['Section']:
            items = obj['SearchSuggestion']['Section']['Item']
        # A single <Item> arrives as a dict rather than a list; anything
        # else yields no suggestions.
        if isinstance(items, dict):
            items = [items]
        elif not isinstance(items, list):
            items = []
        return [item['Text'] for item in items]
    except Exception as e:
        print(f"Error procesando la respuesta de Bing: {e}")
        return []
99
-
100
- # Función para obtener sugerencias de Amazon
101
def fetch_amazon_suggestions(query, market_id="A1F83G8C2ARO7P", alias="aps"):
    """Return Amazon autocomplete suggestions for ``query``.

    Args:
        query: Search prefix, sent as the ``prefix`` parameter.
        market_id: Value sent as the ``mid`` parameter.
        alias: Value sent as the ``alias`` parameter.

    Returns:
        A list with the ``value`` of every entry in the response's
        ``suggestions`` array, or an empty list when the request fails, the
        status is not 200, or the payload does not have the expected shape.
    """
    url = "https://completion.amazon.com/api/2017/suggestions"
    params = {
        "mid": market_id,
        "alias": alias,
        "prefix": query,
    }
    try:
        # Without a timeout a stalled endpoint would block the caller forever.
        response = requests.get(url, params=params, timeout=10)
    except requests.RequestException as exc:
        print(f"Error de red consultando Amazon: {exc}")
        return []

    if response.status_code != 200:
        return []

    try:
        data = response.json()
        return [item['value'] for item in data.get('suggestions', [])]
    except (ValueError, KeyError, TypeError, AttributeError):
        # ValueError: body is not JSON; the others: unexpected payload shape.
        print("Error decodificando JSON de Amazon")
        return []
119
 
120
- # Función para expandir la palabra clave
121
def expand_keyword(keyword):
    """Build the list of query variations derived from ``keyword``.

    The result contains the keyword itself, every ordered arrangement (with
    repetition) of its stemmed non-stopword tokens, the keyword combined
    with a fixed set of Spanish question prefixes and commercial suffixes,
    and a naive plural (trailing "s") of each of those.

    Args:
        keyword: Text typed by the user.

    Returns:
        A sorted list of unique, non-empty query strings; an empty list when
        ``keyword`` is empty or only whitespace.
    """
    keyword = keyword.strip()
    if not keyword:
        # Avoid generating junk queries such as "como " or " gratis".
        return []

    # Tokenize, drop stopwords and reduce each remaining token to its stem.
    tokens = nltk.word_tokenize(keyword.lower())
    stems = [stemmer.stem(token) for token in tokens if token not in stop_words]

    # Always query the literal keyword too; stems alone may never equal it.
    expanded = {keyword}

    # NOTE: product() yields len(stems) ** size tuples per size, so the
    # number of variations grows quickly with the number of tokens.
    for size in range(1, len(stems) + 1):
        expanded.update(" ".join(combo) for combo in product(stems, repeat=size))

    prefixes = ['como', 'que', 'donde', 'cuando', 'por que', 'cual']
    suffixes = ['gratis', 'online', 'pdf', 'precios', 'opiniones']
    expanded.update(f"{prefix} {keyword}" for prefix in prefixes)
    expanded.update(f"{keyword} {suffix}" for suffix in suffixes)

    # Simplified plurals; the list is built first so the set is not
    # modified while it is being iterated.
    expanded.update([f"{kw}s" for kw in expanded])

    expanded.discard('')
    # Sorted so the order of the issued queries is reproducible.
    return sorted(expanded)
153
-
154
- # Función principal
155
def main(keyword):
    """Collect suggestions from every platform and render them as HTML.

    Each variation returned by ``expand_keyword`` is sent to the Google,
    DuckDuckGo, YouTube, Bing and Amazon fetchers. A suggestion's frequency
    is the number of times it was returned across all queries and platforms.

    Args:
        keyword: Text typed by the user.

    Returns:
        An HTML fragment with the combined top 50 suggestions (frequency of
        at least 2), the 20 most frequent suggestions of each platform, and
        the full list of suggestions with their frequencies.
    """
    from collections import Counter
    from html import escape

    fetchers = [
        ('Google', fetch_google_suggestions),
        ('DuckDuckGo', fetch_duckduckgo_suggestions),
        ('YouTube', fetch_youtube_suggestions),
        ('Bing', fetch_bing_suggestions),
        ('Amazon', fetch_amazon_suggestions),
    ]
    expanded_keywords = expand_keyword(keyword)
    all_suggestions = Counter()
    platform_suggestions = {platform: set() for platform, _ in fetchers}

    # Query every platform with every keyword variation.
    for exp_keyword in expanded_keywords:
        for platform, fetch_func in fetchers:
            suggestions = fetch_func(exp_keyword)
            platform_suggestions[platform].update(suggestions)
            all_suggestions.update(suggestions)

    # Most frequent first; ties keep first-seen order.
    sorted_suggestions = sorted(all_suggestions.items(), key=lambda item: item[1], reverse=True)
    combined_top_suggestions = [sug for sug, freq in sorted_suggestions if freq >= 2][:50]
    # Suggestions come from remote services, so escape them before they are
    # placed into markup. innerText still yields the unescaped text on copy.
    suggestions_str = escape(", ".join(combined_top_suggestions))

    html_parts = [f"""
    <div>
        <b>Top 50 Sugerencias combinadas:</b> <span id='suggestions_text'>{suggestions_str}</span>
        <button class="lg secondary svelte-cmf5ev" style="font-size: small; padding: 2px; color: #808080ba; border: none; margin-left: 5px;"
        onclick='navigator.clipboard.writeText(document.getElementById("suggestions_text").innerText).then(() => alert("Texto copiado al portapapeles"))'>&nbsp;✂&nbsp;</button>
    </div>
    """]

    # Top suggestions of each platform, ranked by overall frequency. Slicing
    # the raw set would return an arbitrary 20 instead of the top 20.
    for platform, suggestions in platform_suggestions.items():
        html_parts.append(f"""
        <h4>Top 20 Sugerencias de {platform}:</h4>
        <ul>
        """)
        ranked = sorted(suggestions, key=lambda sug: (-all_suggestions[sug], sug))
        for suggestion in ranked[:20]:
            freq = all_suggestions[suggestion]
            html_parts.append(f"<li>{escape(suggestion)} ({freq})</li>")
        html_parts.append("</ul>")

    # Full list of suggestions with their number of repetitions.
    html_parts.append("""
        <h4>Lista completa de palabras clave con su número de repeticiones:</h4>
        """)
    html_parts.append("<ul>")
    for suggestion, freq in sorted_suggestions:
        html_parts.append(f"<li>{escape(suggestion)} - {freq} repeticiones</li>")
    html_parts.append("</ul>")

    html_output = "".join(html_parts)
    return html_output
221
 
222
  # Interfaz de Gradio
@@ -225,7 +100,7 @@ iface = gr.Interface(
225
  inputs="text",
226
  outputs="html",
227
  title="<div style='margin:0 auto;text-align:center'><div style='margin:0 auto;text-align:center'><img style='width:100px;display: inline-table;margin-bottom:-10px' src='https://artxeweb.com/media/files/search.jpg'><p>Sugerencias Combinadas de Google, DuckDuckGo, YouTube, Bing y Amazon</p></div>",
228
- description="<p style='margin-bottom:10px;text-align:center;background: #ffffff; padding: 8px; border-radius: 8px; border-width: 1px; border: solid 1px #e5e7eb;'>Ingrese una palabra clave para obtener sugerencias de búsqueda relacionadas de Google, DuckDuckGo, YouTube, Bing y Amazon. Se mostrarán las 50 primeras sugerencias combinadas y también las 20 principales de cada plataforma por separado.</p>",
229
  article="<div style='margin-top:10px'><p style='text-align: center !important; background: #ffffff; padding: 5px 30px; border-radius: 8px; border-width: 1px; border: solid 1px #e5e7eb; width: fit-content; margin: auto;'>Desarrollada por <a style='text-decoration: none !important; color: #e12a31 !important;' href='https://artxeweb.com'>© Artxe Web</a></p></div>"
230
  )
231
 
 
3
  import urllib.parse
4
  import re
5
  import xmltodict
6
+ from itertools import product, combinations
7
  import nltk
8
  from nltk.corpus import stopwords
9
  from nltk.stem import SnowballStemmer
10
+ from collections import Counter
11
+ import concurrent.futures
12
+ import ssl
13
+ import os
14
+
15
+ # Configurar SSL para la descarga de NLTK
16
+ try:
17
+ _create_unverified_https_context = ssl._create_unverified_context
18
+ except AttributeError:
19
+ pass
20
+ else:
21
+ ssl._create_default_https_context = _create_unverified_https_context
22
+
23
+ # Función para descargar recursos de NLTK
24
def download_nltk_resources():
    """Download the NLTK packages this module needs if they are missing.

    Each package is looked up with ``nltk.data.find`` and downloaded only
    when the lookup raises ``LookupError``.
    """
    # nltk.data.find() expects "<category>/<package>". Only punkt lives
    # under tokenizers/; stopwords and words are corpora, so looking them up
    # under tokenizers/ would fail and re-download them on every start.
    # NOTE(review): recent NLTK releases may load "punkt_tab" instead of
    # "punkt" for word_tokenize -- confirm against the installed version.
    resources = {
        'punkt': 'tokenizers/punkt',
        'stopwords': 'corpora/stopwords',
        'words': 'corpora/words',
    }
    for resource, data_path in resources.items():
        try:
            nltk.data.find(data_path)
        except LookupError:
            print(f"Descargando {resource}...")
            nltk.download(resource, quiet=True)
32
 
33
  # Descargar recursos de NLTK
34
+ download_nltk_resources()
35
+
36
+ # Configurar el directorio de datos de NLTK
37
+ nltk_data_dir = '/tmp/nltk_data'
38
+ os.makedirs(nltk_data_dir, exist_ok=True)
39
+ nltk.data.path.append(nltk_data_dir)
40
 
41
  # Inicializar el stemmer y la lista de stopwords
42
  stemmer = SnowballStemmer("spanish")
43
  stop_words = set(stopwords.words('spanish'))
44
+ english_words = set(nltk.corpus.words.words())
45
 
46
+ # El resto del código permanece igual...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
+ # Función principal
49
def main(keyword):
    """Fetch suggestions for every keyword variation and render them as HTML.

    Variations from ``expand_keyword`` are passed to ``fetch_all_suggestions``
    on a thread pool. Each result is consumed as an iterable of
    ``(suggestion, platform)`` pairs. A suggestion's relevance score is its
    frequency multiplied by the number of distinct platforms that returned it.

    Args:
        keyword: Text typed by the user.

    Returns:
        An HTML fragment with all suggestions ordered by relevance score,
        the number of suggestions per platform, and a button that copies
        every suggestion to the clipboard.
    """
    import json
    from collections import defaultdict
    from html import escape

    expanded_keywords = expand_keyword(keyword)
    all_suggestions = []

    # The work is network-bound, so the queries are overlapped with threads.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_keyword = {executor.submit(fetch_all_suggestions, kw): kw for kw in expanded_keywords}
        for future in concurrent.futures.as_completed(future_to_keyword):
            kw = future_to_keyword[future]
            try:
                all_suggestions.extend(future.result())
            except Exception as exc:
                # Best effort: one failing variation must not abort the rest.
                print(f'{kw} generated an exception: {exc}')

    # Single pass: frequency per suggestion, volume per platform, and the
    # set of platforms per suggestion (instead of rescanning the whole list
    # for every suggestion).
    suggestion_counter = Counter()
    platform_counter = Counter()
    platforms_by_suggestion = defaultdict(set)
    for suggestion, platform in all_suggestions:
        suggestion_counter[suggestion] += 1
        platform_counter[platform] += 1
        platforms_by_suggestion[suggestion].add(platform)

    # The heading promises relevance order, so rank by the displayed score;
    # ties fall back to frequency and then to alphabetical order so the
    # output does not depend on thread completion order.
    ranked = sorted(
        (
            (suggestion, count, sorted(platforms_by_suggestion[suggestion]))
            for suggestion, count in suggestion_counter.items()
        ),
        key=lambda row: (-row[1] * len(row[2]), -row[1], row[0]),
    )

    # Suggestions and platform names are escaped before being interpolated
    # because they originate from remote services.
    html_parts = ["<h3>Todas las sugerencias ordenadas por relevancia:</h3><ul>"]
    for suggestion, count, platforms in ranked:
        platforms_str = escape(", ".join(platforms))
        relevance_score = count * len(platforms)
        html_parts.append(
            f"<li><strong>{escape(suggestion)}</strong> (Frecuencia: {count}, Plataformas: {platforms_str}, Puntuación de relevancia: {relevance_score})</li>"
        )
    html_parts.append("</ul>")

    # Per-platform statistics.
    html_parts.append("<h3>Estadísticas por plataforma:</h3><ul>")
    for platform, count in platform_counter.most_common():
        html_parts.append(f"<li>{escape(str(platform))}: {count} sugerencias</li>")
    html_parts.append("</ul>")

    # Copy button. The text is emitted as a JSON string literal (valid
    # JavaScript) and then HTML-escaped, so quotes, backticks or "${" inside
    # a suggestion cannot break out of the script or the attribute.
    all_suggestions_text = ", ".join(suggestion for suggestion, _, _ in ranked)
    clipboard_js = escape(json.dumps(all_suggestions_text), quote=True)
    html_parts.append(f"""
    <div>
        <h3>Copiar todas las sugerencias:</h3>
        <button class="lg secondary svelte-cmf5ev" style="font-size: small; padding: 2px; color: #808080ba; border: none; margin-left: 5px;"
        onclick='navigator.clipboard.writeText({clipboard_js}).then(() => alert("Todas las sugerencias copiadas al portapapeles"))'>&nbsp;Copiar todas las sugerencias&nbsp;</button>
    </div>
    """)

    html_output = "".join(html_parts)
    return html_output
96
 
97
  # Interfaz de Gradio
 
100
  inputs="text",
101
  outputs="html",
102
  title="<div style='margin:0 auto;text-align:center'><div style='margin:0 auto;text-align:center'><img style='width:100px;display: inline-table;margin-bottom:-10px' src='https://artxeweb.com/media/files/search.jpg'><p>Sugerencias Combinadas de Google, DuckDuckGo, YouTube, Bing y Amazon</p></div>",
103
+ description="<p style='margin-bottom:10px;text-align:center;background: #ffffff; padding: 8px; border-radius: 8px; border-width: 1px; border: solid 1px #e5e7eb;'>Ingrese una palabra clave para obtener sugerencias de búsqueda relacionadas de Google, DuckDuckGo, YouTube, Bing y Amazon. Se mostrarán todas las sugerencias ordenadas por relevancia.</p>",
104
  article="<div style='margin-top:10px'><p style='text-align: center !important; background: #ffffff; padding: 5px 30px; border-radius: 8px; border-width: 1px; border: solid 1px #e5e7eb; width: fit-content; margin: auto;'>Desarrollada por <a style='text-decoration: none !important; color: #e12a31 !important;' href='https://artxeweb.com'>© Artxe Web</a></p></div>"
105
  )
106