Upload 5 files
Browse files- README.md +53 -23
- app.py +395 -377
- requirements.txt +27 -5
- test_app.py +19 -0
- web_scraper_tool.py +361 -146
README.md
CHANGED
|
@@ -1,42 +1,72 @@
|
|
| 1 |
---
|
| 2 |
-
title: Web Scraper Tool
|
| 3 |
emoji: 🕸️
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: blue
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
-
pinned:
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
# 🕸️ Web Scraper Tool
|
| 13 |
|
| 14 |
-
Una herramienta
|
| 15 |
-
Esta aplicación está optimizada para generar archivos que puedan ser procesados por Copilot.
|
| 16 |
|
| 17 |
## ✨ Características
|
| 18 |
|
| 19 |
-
-
|
| 20 |
-
-
|
| 21 |
-
-
|
| 22 |
-
-
|
| 23 |
-
-
|
|
|
|
| 24 |
|
| 25 |
-
## 🚀
|
| 26 |
|
| 27 |
-
1. Ingresa la URL
|
| 28 |
-
2. Selecciona
|
| 29 |
-
3. Haz clic en "
|
| 30 |
-
4. Descarga
|
| 31 |
|
| 32 |
-
##
|
| 33 |
|
| 34 |
-
-
|
| 35 |
-
-
|
| 36 |
-
-
|
| 37 |
-
-
|
| 38 |
-
-
|
| 39 |
|
| 40 |
-
##
|
| 41 |
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: 🕸️ Web Scraper Tool
|
| 3 |
emoji: 🕸️
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: blue
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.44.1
|
| 8 |
app_file: app.py
|
| 9 |
+
pinned: true
|
| 10 |
+
license: mit
|
| 11 |
+
short_description: Extrae contenido web y convierte a PDF/TXT para Copilot
|
| 12 |
---
|
| 13 |
|
| 14 |
# 🕸️ Web Scraper Tool
|
| 15 |
|
| 16 |
+
Una herramienta profesional para extraer contenido de páginas web y convertirlo a formatos compatibles con Microsoft Copilot (PDF y TXT).
|
|
|
|
| 17 |
|
| 18 |
## ✨ Características
|
| 19 |
|
| 20 |
+
- **URLs flexibles**: Funciona con cualquier formato de URL (HTTP, HTTPS, con/sin www, mayúsculas/minúsculas)
|
| 21 |
+
- **Detección automática**: Identifica automáticamente si el contenido es una imagen o texto
|
| 22 |
+
- **Múltiples formatos**: Genera archivos PDF (con formato visual) o TXT (texto plano)
|
| 23 |
+
- **Optimizado para Copilot**: Los archivos están específicamente formateados para Microsoft Copilot
|
| 24 |
+
- **Interfaz minimalista**: Diseño profesional y fácil de usar
|
| 25 |
+
- **Procesamiento robusto**: Manejo inteligente de errores y normalización de URLs
|
| 26 |
|
| 27 |
+
## 🚀 Cómo usar
|
| 28 |
|
| 29 |
+
1. **Ingresa la URL**: Pega cualquier URL de página web (soporta formatos como "Https://EXAMPLE.com")
|
| 30 |
+
2. **Selecciona formato**: Elige entre PDF (visual) o TXT (solo texto)
|
| 31 |
+
3. **Procesa**: Haz clic en "Extraer y Convertir"
|
| 32 |
+
4. **Descarga**: Obtén tu archivo listo para usar con Copilot
|
| 33 |
|
| 34 |
+
## 🎯 Casos de uso
|
| 35 |
|
| 36 |
+
- Extraer artículos y documentos para análisis con IA
|
| 37 |
+
- Convertir páginas web a formato legible por Copilot
|
| 38 |
+
- Guardar contenido de foros y discusiones (como Spiceworks)
|
| 39 |
+
- Procesar documentación técnica
|
| 40 |
+
- Extraer texto de páginas con mucho código HTML
|
| 41 |
|
| 42 |
+
## 🛠️ Tecnología
|
| 43 |
|
| 44 |
+
- **Frontend**: Gradio 4.44.1 con diseño minimalista personalizado
|
| 45 |
+
- **Web Scraping**: Beautiful Soup + Requests con headers inteligentes
|
| 46 |
+
- **Conversión PDF**: WeasyPrint con optimizaciones para texto
|
| 47 |
+
- **Procesamiento**: Python con manejo robusto de errores
|
| 48 |
+
|
| 49 |
+
## 📝 Formatos soportados
|
| 50 |
+
|
| 51 |
+
### PDF
|
| 52 |
+
- Mantiene formato visual y estructura
|
| 53 |
+
- Incluye estilos CSS básicos
|
| 54 |
+
- Ideal para documentos con formato
|
| 55 |
+
|
| 56 |
+
### TXT
|
| 57 |
+
- Texto plano limpio
|
| 58 |
+
- Incluye metadatos del contenido
|
| 59 |
+
- Perfecto para análisis de texto con IA
|
| 60 |
+
|
| 61 |
+
## 🔧 Características técnicas
|
| 62 |
+
|
| 63 |
+
- Normalización automática de URLs
|
| 64 |
+
- Detección de content-type HTTP
|
| 65 |
+
- Headers rotativos para evitar bloqueos
|
| 66 |
+
- Timeouts configurables
|
| 67 |
+
- Manejo de encoding automático
|
| 68 |
+
- Limpieza inteligente de HTML
|
| 69 |
+
|
| 70 |
+
---
|
| 71 |
+
|
| 72 |
+
Desarrollado con ❤️ para maximizar la compatibilidad con herramientas de IA como Microsoft Copilot.
|
app.py
CHANGED
|
@@ -1,380 +1,398 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
-
import
|
| 3 |
-
from
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
import
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
}
|
| 123 |
-
img {
|
| 124 |
-
max-width: 100%;
|
| 125 |
-
height: auto;
|
| 126 |
-
}
|
| 127 |
-
table {
|
| 128 |
-
border-collapse: collapse;
|
| 129 |
-
width: 100%;
|
| 130 |
-
}
|
| 131 |
-
th, td {
|
| 132 |
-
border: 1px solid #ddd;
|
| 133 |
-
padding: 8px;
|
| 134 |
-
text-align: left;
|
| 135 |
-
}
|
| 136 |
-
</style>
|
| 137 |
-
"""
|
| 138 |
-
|
| 139 |
-
# Insertar CSS en el head
|
| 140 |
-
if soup.head:
|
| 141 |
-
soup.head.insert(0, BeautifulSoup(css_style, 'html.parser'))
|
| 142 |
-
else:
|
| 143 |
-
# Si no hay head, crear uno
|
| 144 |
-
head = soup.new_tag('head')
|
| 145 |
-
head.insert(0, BeautifulSoup(css_style, 'html.parser'))
|
| 146 |
-
if soup.html:
|
| 147 |
-
soup.html.insert(0, head)
|
| 148 |
-
else:
|
| 149 |
-
# Crear estructura HTML completa
|
| 150 |
-
html_tag = soup.new_tag('html')
|
| 151 |
-
html_tag.insert(0, head)
|
| 152 |
-
body = soup.new_tag('body')
|
| 153 |
-
body.extend(soup.contents[:])
|
| 154 |
-
html_tag.append(body)
|
| 155 |
-
soup.clear()
|
| 156 |
-
soup.append(html_tag)
|
| 157 |
-
|
| 158 |
-
return str(soup)
|
| 159 |
-
|
| 160 |
-
def scrape_to_pdf(self, url, filename=None):
|
| 161 |
-
"""Convierte página web a PDF con manejo robusto de errores"""
|
| 162 |
-
try:
|
| 163 |
-
normalized_url = self.normalize_url(url)
|
| 164 |
-
|
| 165 |
-
# Verificar si es imagen
|
| 166 |
-
if self.is_image_url(normalized_url):
|
| 167 |
-
return self._handle_image_to_pdf(normalized_url, filename)
|
| 168 |
-
|
| 169 |
-
# Obtener contenido web
|
| 170 |
-
response = requests.get(normalized_url, headers=self.headers, timeout=30)
|
| 171 |
-
response.raise_for_status()
|
| 172 |
-
response.encoding = response.apparent_encoding or 'utf-8'
|
| 173 |
-
|
| 174 |
-
# Limpiar HTML para PDF
|
| 175 |
-
clean_html = self.get_clean_html_for_pdf(response.text, normalized_url)
|
| 176 |
-
|
| 177 |
-
# Generar nombre de archivo
|
| 178 |
-
if not filename:
|
| 179 |
-
domain = urlparse(normalized_url).netloc.replace('www.', '')
|
| 180 |
-
filename = f"scraped_{domain.replace('.', '_')}.pdf"
|
| 181 |
-
|
| 182 |
-
if not filename.endswith('.pdf'):
|
| 183 |
-
filename += '.pdf'
|
| 184 |
-
|
| 185 |
-
pdf_path = os.path.join(self.output_dir, filename)
|
| 186 |
-
|
| 187 |
-
# Configurar WeasyPrint con opciones robustas
|
| 188 |
-
html_doc = HTML(string=clean_html, base_url=normalized_url)
|
| 189 |
-
|
| 190 |
-
# CSS adicional para mejorar renderizado
|
| 191 |
-
css = CSS(string='''
|
| 192 |
-
@page {
|
| 193 |
-
margin: 2cm;
|
| 194 |
-
size: A4;
|
| 195 |
-
}
|
| 196 |
-
body {
|
| 197 |
-
font-size: 12pt;
|
| 198 |
-
}
|
| 199 |
-
''')
|
| 200 |
-
|
| 201 |
-
html_doc.write_pdf(pdf_path, stylesheets=[css])
|
| 202 |
-
|
| 203 |
-
return {
|
| 204 |
-
'status': 'success',
|
| 205 |
-
'file': pdf_path,
|
| 206 |
-
'url': normalized_url,
|
| 207 |
-
'message': f'PDF generado exitosamente: {filename}'
|
| 208 |
-
}
|
| 209 |
-
|
| 210 |
-
except requests.RequestException as e:
|
| 211 |
-
return {
|
| 212 |
-
'status': 'error',
|
| 213 |
-
'message': f'Error al acceder a la URL: {str(e)}',
|
| 214 |
-
'url': url
|
| 215 |
-
}
|
| 216 |
-
except Exception as e:
|
| 217 |
-
return {
|
| 218 |
-
'status': 'error',
|
| 219 |
-
'message': f'Error al generar PDF: {str(e)}',
|
| 220 |
-
'url': url
|
| 221 |
-
}
|
| 222 |
-
|
| 223 |
-
def scrape_to_text(self, url, filename=None):
|
| 224 |
-
"""Convierte página web a texto plano"""
|
| 225 |
-
try:
|
| 226 |
-
normalized_url = self.normalize_url(url)
|
| 227 |
-
|
| 228 |
-
# Verificar si es imagen
|
| 229 |
-
if self.is_image_url(normalized_url):
|
| 230 |
-
return self._handle_image_to_text(normalized_url, filename)
|
| 231 |
-
|
| 232 |
-
# Obtener contenido web
|
| 233 |
-
response = requests.get(normalized_url, headers=self.headers, timeout=30)
|
| 234 |
-
response.raise_for_status()
|
| 235 |
-
response.encoding = response.apparent_encoding or 'utf-8'
|
| 236 |
-
|
| 237 |
-
# Extraer texto limpio
|
| 238 |
-
soup = BeautifulSoup(response.text, 'html.parser')
|
| 239 |
-
|
| 240 |
-
# Remover elementos no deseados
|
| 241 |
-
for element in soup(['script', 'style', 'noscript', 'header', 'footer', 'nav']):
|
| 242 |
-
element.decompose()
|
| 243 |
-
|
| 244 |
-
# Extraer texto con separadores
|
| 245 |
-
text_content = soup.get_text(separator='\n', strip=True)
|
| 246 |
-
|
| 247 |
-
# Limpiar texto
|
| 248 |
-
lines = [line.strip() for line in text_content.split('\n') if line.strip()]
|
| 249 |
-
clean_text = '\n'.join(lines)
|
| 250 |
-
|
| 251 |
-
# Agregar metadatos
|
| 252 |
-
metadata = f"""URL: {normalized_url}
|
| 253 |
-
Fecha de extracción: {requests.utils.default_headers()['User-Agent']}
|
| 254 |
-
Caracteres extraídos: {len(clean_text)}
|
| 255 |
-
|
| 256 |
-
{'='*50}
|
| 257 |
-
|
| 258 |
-
{clean_text}"""
|
| 259 |
-
|
| 260 |
-
# Generar nombre de archivo
|
| 261 |
-
if not filename:
|
| 262 |
-
domain = urlparse(normalized_url).netloc.replace('www.', '')
|
| 263 |
-
filename = f"scraped_{domain.replace('.', '_')}.txt"
|
| 264 |
-
|
| 265 |
-
if not filename.endswith('.txt'):
|
| 266 |
-
filename += '.txt'
|
| 267 |
-
|
| 268 |
-
txt_path = os.path.join(self.output_dir, filename)
|
| 269 |
-
|
| 270 |
-
with open(txt_path, 'w', encoding='utf-8') as f:
|
| 271 |
-
f.write(metadata)
|
| 272 |
-
|
| 273 |
-
return {
|
| 274 |
-
'status': 'success',
|
| 275 |
-
'file': txt_path,
|
| 276 |
-
'url': normalized_url,
|
| 277 |
-
'message': f'Texto extraído exitosamente: {filename}'
|
| 278 |
-
}
|
| 279 |
-
|
| 280 |
-
except Exception as e:
|
| 281 |
-
return {
|
| 282 |
-
'status': 'error',
|
| 283 |
-
'message': f'Error al extraer texto: {str(e)}',
|
| 284 |
-
'url': url
|
| 285 |
-
}
|
| 286 |
-
|
| 287 |
-
def _handle_image_to_pdf(self, url, filename):
|
| 288 |
-
"""Maneja conversión de imagen a PDF"""
|
| 289 |
-
try:
|
| 290 |
-
response = requests.get(url, headers=self.headers, timeout=30)
|
| 291 |
-
response.raise_for_status()
|
| 292 |
-
|
| 293 |
-
# Crear HTML con la imagen
|
| 294 |
-
html_content = f"""
|
| 295 |
-
<html>
|
| 296 |
-
<head>
|
| 297 |
-
<style>
|
| 298 |
-
body {{ margin: 0; padding: 20px; text-align: center; }}
|
| 299 |
-
img {{ max-width: 100%; height: auto; }}
|
| 300 |
-
.info {{ margin-top: 20px; font-family: Arial, sans-serif; }}
|
| 301 |
-
</style>
|
| 302 |
-
</head>
|
| 303 |
-
<body>
|
| 304 |
-
<img src="{url}" alt="Imagen extraída">
|
| 305 |
-
<div class="info">
|
| 306 |
-
<p><strong>URL:</strong> {url}</p>
|
| 307 |
-
<p><strong>Tipo:</strong> Imagen</p>
|
| 308 |
-
</div>
|
| 309 |
-
</body>
|
| 310 |
-
</html>
|
| 311 |
-
"""
|
| 312 |
-
|
| 313 |
-
if not filename:
|
| 314 |
-
filename = "image_scraped.pdf"
|
| 315 |
-
|
| 316 |
-
pdf_path = os.path.join(self.output_dir, filename)
|
| 317 |
-
HTML(string=html_content).write_pdf(pdf_path)
|
| 318 |
-
|
| 319 |
-
return {
|
| 320 |
-
'status': 'success',
|
| 321 |
-
'file': pdf_path,
|
| 322 |
-
'url': url,
|
| 323 |
-
'message': f'Imagen convertida a PDF: {filename}'
|
| 324 |
-
}
|
| 325 |
-
|
| 326 |
-
except Exception as e:
|
| 327 |
-
return {
|
| 328 |
-
'status': 'error',
|
| 329 |
-
'message': f'Error al procesar imagen: {str(e)}',
|
| 330 |
-
'url': url
|
| 331 |
-
}
|
| 332 |
-
|
| 333 |
-
def _handle_image_to_text(self, url, filename):
|
| 334 |
-
"""Maneja conversión de imagen a archivo de texto con metadatos"""
|
| 335 |
-
try:
|
| 336 |
-
response = requests.get(url, headers=self.headers, timeout=30)
|
| 337 |
-
response.raise_for_status()
|
| 338 |
-
|
| 339 |
-
# Obtener información de la imagen
|
| 340 |
-
try:
|
| 341 |
-
img = Image.open(io.BytesIO(response.content))
|
| 342 |
-
img_info = f"""IMAGEN DETECTADA
|
| 343 |
-
URL: {url}
|
| 344 |
-
Formato: {img.format}
|
| 345 |
-
Dimensiones: {img.size[0]}x{img.size[1]} píxeles
|
| 346 |
-
Modo: {img.mode}
|
| 347 |
-
Tamaño del archivo: {len(response.content)} bytes
|
| 348 |
-
|
| 349 |
-
Esta URL contiene una imagen, no texto extraíble.
|
| 350 |
-
Para procesar el contenido visual, considera usar herramientas de OCR.
|
| 351 |
-
"""
|
| 352 |
-
except:
|
| 353 |
-
img_info = f"""IMAGEN DETECTADA
|
| 354 |
-
URL: {url}
|
| 355 |
-
Tamaño del archivo: {len(response.content)} bytes
|
| 356 |
|
| 357 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
"""
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
import os
|
| 3 |
+
import tempfile
|
| 4 |
+
from web_scraper_tool import WebScrapperTool
|
| 5 |
+
|
| 6 |
+
# CSS personalizado con estética minimalista profesional
|
| 7 |
+
custom_css = """
|
| 8 |
+
/* Importar fuente Inter */
|
| 9 |
+
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
|
| 10 |
+
|
| 11 |
+
/* Variables globales */
|
| 12 |
+
:root {
|
| 13 |
+
--primary-color: #8b5cf6;
|
| 14 |
+
--primary-hover: #7c3aed;
|
| 15 |
+
--secondary-color: #f8fafc;
|
| 16 |
+
--text-primary: #1e293b;
|
| 17 |
+
--text-secondary: #64748b;
|
| 18 |
+
--border-color: #e2e8f0;
|
| 19 |
+
--success-color: #10b981;
|
| 20 |
+
--error-color: #ef4444;
|
| 21 |
+
--warning-color: #f59e0b;
|
| 22 |
+
--gradient-bg: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
/* Reset y configuración base */
|
| 26 |
+
* {
|
| 27 |
+
box-sizing: border-box;
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
body {
|
| 31 |
+
font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', system-ui, sans-serif;
|
| 32 |
+
background: var(--gradient-bg);
|
| 33 |
+
margin: 0;
|
| 34 |
+
padding: 0;
|
| 35 |
+
min-height: 100vh;
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
/* Contenedor principal */
|
| 39 |
+
.gradio-container {
|
| 40 |
+
max-width: 800px !important;
|
| 41 |
+
margin: 0 auto !important;
|
| 42 |
+
padding: 2rem 1rem !important;
|
| 43 |
+
background: rgba(255, 255, 255, 0.95);
|
| 44 |
+
backdrop-filter: blur(10px);
|
| 45 |
+
border-radius: 24px;
|
| 46 |
+
box-shadow: 0 25px 50px -12px rgba(0, 0, 0, 0.25);
|
| 47 |
+
margin-top: 2rem !important;
|
| 48 |
+
margin-bottom: 2rem !important;
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
/* Título principal */
|
| 52 |
+
.gradio-container h1 {
|
| 53 |
+
color: var(--text-primary);
|
| 54 |
+
font-size: 2.5rem;
|
| 55 |
+
font-weight: 700;
|
| 56 |
+
text-align: center;
|
| 57 |
+
margin-bottom: 0.5rem;
|
| 58 |
+
background: linear-gradient(135deg, var(--primary-color), var(--primary-hover));
|
| 59 |
+
-webkit-background-clip: text;
|
| 60 |
+
-webkit-text-fill-color: transparent;
|
| 61 |
+
background-clip: text;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
/* Subtítulo */
|
| 65 |
+
.gradio-container p {
|
| 66 |
+
color: var(--text-secondary);
|
| 67 |
+
font-size: 1.125rem;
|
| 68 |
+
text-align: center;
|
| 69 |
+
margin-bottom: 2rem;
|
| 70 |
+
line-height: 1.6;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
/* Campos de entrada */
|
| 74 |
+
.gr-textbox {
|
| 75 |
+
border: 2px solid var(--border-color) !important;
|
| 76 |
+
border-radius: 12px !important;
|
| 77 |
+
padding: 12px 16px !important;
|
| 78 |
+
font-size: 1rem !important;
|
| 79 |
+
transition: all 0.3s ease !important;
|
| 80 |
+
background: white !important;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
.gr-textbox:focus {
|
| 84 |
+
border-color: var(--primary-color) !important;
|
| 85 |
+
box-shadow: 0 0 0 3px rgba(139, 92, 246, 0.1) !important;
|
| 86 |
+
outline: none !important;
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
/* Botones */
|
| 90 |
+
.gr-button {
|
| 91 |
+
background: var(--primary-color) !important;
|
| 92 |
+
color: white !important;
|
| 93 |
+
border: none !important;
|
| 94 |
+
border-radius: 12px !important;
|
| 95 |
+
padding: 12px 24px !important;
|
| 96 |
+
font-size: 1rem !important;
|
| 97 |
+
font-weight: 600 !important;
|
| 98 |
+
cursor: pointer !important;
|
| 99 |
+
transition: all 0.3s ease !important;
|
| 100 |
+
text-transform: none !important;
|
| 101 |
+
letter-spacing: 0.025em !important;
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
.gr-button:hover {
|
| 105 |
+
background: var(--primary-hover) !important;
|
| 106 |
+
transform: translateY(-2px) !important;
|
| 107 |
+
box-shadow: 0 10px 25px -5px rgba(139, 92, 246, 0.4) !important;
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
.gr-button:active {
|
| 111 |
+
transform: translateY(0) !important;
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
/* Radio buttons */
|
| 115 |
+
.gr-radio {
|
| 116 |
+
margin: 1rem 0 !important;
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
.gr-radio label {
|
| 120 |
+
font-weight: 500 !important;
|
| 121 |
+
color: var(--text-primary) !important;
|
| 122 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
+
/* Mensajes de estado */
|
| 125 |
+
.gr-textbox[data-testid="textbox"] {
|
| 126 |
+
font-family: 'Inter', monospace !important;
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
/* Área de descarga */
|
| 130 |
+
.gr-file {
|
| 131 |
+
border: 2px dashed var(--border-color) !important;
|
| 132 |
+
border-radius: 12px !important;
|
| 133 |
+
padding: 2rem !important;
|
| 134 |
+
text-align: center !important;
|
| 135 |
+
background: var(--secondary-color) !important;
|
| 136 |
+
transition: all 0.3s ease !important;
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
.gr-file:hover {
|
| 140 |
+
border-color: var(--primary-color) !important;
|
| 141 |
+
background: rgba(139, 92, 246, 0.05) !important;
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
/* Indicadores de progreso */
|
| 145 |
+
.progress-bar {
|
| 146 |
+
width: 100%;
|
| 147 |
+
height: 6px;
|
| 148 |
+
background: var(--border-color);
|
| 149 |
+
border-radius: 3px;
|
| 150 |
+
overflow: hidden;
|
| 151 |
+
margin: 1rem 0;
|
| 152 |
+
}
|
| 153 |
+
|
| 154 |
+
.progress-fill {
|
| 155 |
+
height: 100%;
|
| 156 |
+
background: var(--primary-color);
|
| 157 |
+
border-radius: 3px;
|
| 158 |
+
transition: width 0.3s ease;
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
/* Estados de mensajes */
|
| 162 |
+
.success-message {
|
| 163 |
+
background: rgba(16, 185, 129, 0.1) !important;
|
| 164 |
+
border: 1px solid var(--success-color) !important;
|
| 165 |
+
color: var(--success-color) !important;
|
| 166 |
+
border-radius: 8px !important;
|
| 167 |
+
padding: 12px 16px !important;
|
| 168 |
+
margin: 1rem 0 !important;
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
.error-message {
|
| 172 |
+
background: rgba(239, 68, 68, 0.1) !important;
|
| 173 |
+
border: 1px solid var(--error-color) !important;
|
| 174 |
+
color: var(--error-color) !important;
|
| 175 |
+
border-radius: 8px !important;
|
| 176 |
+
padding: 12px 16px !important;
|
| 177 |
+
margin: 1rem 0 !important;
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
/* Responsive design */
|
| 181 |
+
@media (max-width: 768px) {
|
| 182 |
+
.gradio-container {
|
| 183 |
+
margin: 1rem !important;
|
| 184 |
+
padding: 1.5rem 1rem !important;
|
| 185 |
+
border-radius: 16px !important;
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
.gradio-container h1 {
|
| 189 |
+
font-size: 2rem !important;
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
.gradio-container p {
|
| 193 |
+
font-size: 1rem !important;
|
| 194 |
+
}
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
/* Footer */
|
| 198 |
+
.footer {
|
| 199 |
+
text-align: center;
|
| 200 |
+
margin-top: 2rem;
|
| 201 |
+
padding-top: 2rem;
|
| 202 |
+
border-top: 1px solid var(--border-color);
|
| 203 |
+
color: var(--text-secondary);
|
| 204 |
+
font-size: 0.875rem;
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
/* Animaciones sutiles */
|
| 208 |
+
@keyframes fadeIn {
|
| 209 |
+
from {
|
| 210 |
+
opacity: 0;
|
| 211 |
+
transform: translateY(20px);
|
| 212 |
+
}
|
| 213 |
+
to {
|
| 214 |
+
opacity: 1;
|
| 215 |
+
transform: translateY(0);
|
| 216 |
+
}
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
.gradio-container > * {
|
| 220 |
+
animation: fadeIn 0.6s ease forwards;
|
| 221 |
+
}
|
| 222 |
"""
|
| 223 |
+
|
| 224 |
+
# Inicializar la herramienta de scraping
|
| 225 |
+
scraper = WebScrapperTool()
|
| 226 |
+
|
| 227 |
+
def validate_url(url):
|
| 228 |
+
"""Valida la URL ingresada"""
|
| 229 |
+
if not url or not url.strip():
|
| 230 |
+
return False, "❌ Por favor ingresa una URL válida"
|
| 231 |
+
|
| 232 |
+
try:
|
| 233 |
+
normalized = scraper.normalize_url(url.strip())
|
| 234 |
+
return True, f"✅ URL válida: {normalized}"
|
| 235 |
+
except Exception as e:
|
| 236 |
+
return False, f"❌ Error en URL: {str(e)}"
|
| 237 |
+
|
| 238 |
+
def process_url(url, format_choice, progress=gr.Progress()):
|
| 239 |
+
"""Procesa la URL y genera el archivo en el formato seleccionado"""
|
| 240 |
+
if not url or not url.strip():
|
| 241 |
+
return "❌ Por favor ingresa una URL válida", None
|
| 242 |
+
|
| 243 |
+
try:
|
| 244 |
+
# Validar URL
|
| 245 |
+
progress(0.1, desc="Validando URL...")
|
| 246 |
+
is_valid, message = validate_url(url)
|
| 247 |
+
if not is_valid:
|
| 248 |
+
return message, None
|
| 249 |
+
|
| 250 |
+
# Normalizar URL
|
| 251 |
+
progress(0.2, desc="Normalizando URL...")
|
| 252 |
+
normalized_url = scraper.normalize_url(url.strip())
|
| 253 |
+
|
| 254 |
+
# Detectar tipo de contenido
|
| 255 |
+
progress(0.3, desc="Detectando tipo de contenido...")
|
| 256 |
+
is_image = scraper.is_image_url(normalized_url)
|
| 257 |
+
content_type = "🖼️ Imagen" if is_image else "📄 Página web"
|
| 258 |
+
|
| 259 |
+
# Procesar según formato seleccionado
|
| 260 |
+
progress(0.5, desc=f"Extrayendo contenido ({format_choice})...")
|
| 261 |
+
|
| 262 |
+
if format_choice == "PDF":
|
| 263 |
+
result = scraper.scrape_to_pdf(normalized_url)
|
| 264 |
+
else: # TXT
|
| 265 |
+
result = scraper.scrape_to_text(normalized_url)
|
| 266 |
+
|
| 267 |
+
progress(0.9, desc="Finalizando...")
|
| 268 |
+
|
| 269 |
+
if result['status'] == 'success':
|
| 270 |
+
progress(1.0, desc="¡Completado!")
|
| 271 |
+
success_msg = f"""✅ **Procesamiento exitoso**
|
| 272 |
+
|
| 273 |
+
🔗 **URL procesada:** {result['url']}
|
| 274 |
+
📁 **Archivo generado:** {os.path.basename(result['file'])}
|
| 275 |
+
📊 **Tipo de contenido:** {content_type}
|
| 276 |
+
📄 **Formato de salida:** {format_choice}
|
| 277 |
+
|
| 278 |
+
💡 **Listo para Copilot:** El archivo está optimizado para ser procesado por Microsoft Copilot"""
|
| 279 |
+
|
| 280 |
+
return success_msg, result['file']
|
| 281 |
+
else:
|
| 282 |
+
error_msg = f"""❌ **Error en el procesamiento**
|
| 283 |
+
|
| 284 |
+
🔗 **URL:** {result.get('url', url)}
|
| 285 |
+
⚠️ **Error:** {result['message']}
|
| 286 |
+
|
| 287 |
+
💡 **Sugerencias:**
|
| 288 |
+
- Verifica que la URL esté accesible
|
| 289 |
+
- Intenta con una URL diferente
|
| 290 |
+
- Algunos sitios pueden bloquear el scraping automático"""
|
| 291 |
+
|
| 292 |
+
return error_msg, None
|
| 293 |
+
|
| 294 |
+
except Exception as e:
|
| 295 |
+
error_msg = f"""❌ **Error inesperado**
|
| 296 |
+
|
| 297 |
+
⚠️ **Error:** {str(e)}
|
| 298 |
+
|
| 299 |
+
💡 **Intenta nuevamente con una URL diferente**"""
|
| 300 |
+
|
| 301 |
+
return error_msg, None
|
| 302 |
+
|
| 303 |
+
# Crear interfaz Gradio
|
| 304 |
+
with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="🕸️ Web Scraper Tool") as demo:
|
| 305 |
+
gr.HTML("""
|
| 306 |
+
<div style="text-align: center; margin-bottom: 2rem;">
|
| 307 |
+
<h1>🕸️ Web Scraper Tool</h1>
|
| 308 |
+
<p>Extrae contenido de páginas web y conviértelo a formatos compatibles con Microsoft Copilot</p>
|
| 309 |
+
</div>
|
| 310 |
+
""")
|
| 311 |
+
|
| 312 |
+
with gr.Row():
|
| 313 |
+
with gr.Column(scale=3):
|
| 314 |
+
url_input = gr.Textbox(
|
| 315 |
+
label="🔗 URL de la página web",
|
| 316 |
+
placeholder="https://ejemplo.com o Https://EJEMPLO.com (mayúsculas OK)",
|
| 317 |
+
info="Soporta URLs con cualquier formato de mayúsculas/minúsculas",
|
| 318 |
+
lines=1
|
| 319 |
+
)
|
| 320 |
+
|
| 321 |
+
with gr.Column(scale=1):
|
| 322 |
+
format_choice = gr.Radio(
|
| 323 |
+
choices=["PDF", "TXT"],
|
| 324 |
+
value="TXT",
|
| 325 |
+
label="📄 Formato de salida",
|
| 326 |
+
info="Ambos formatos son compatibles con Copilot"
|
| 327 |
+
)
|
| 328 |
+
|
| 329 |
+
# Botón de validación en tiempo real
|
| 330 |
+
validate_btn = gr.Button("🔍 Validar URL", variant="secondary", size="sm")
|
| 331 |
+
validation_output = gr.Textbox(
|
| 332 |
+
label="Estado de validación",
|
| 333 |
+
interactive=False,
|
| 334 |
+
show_label=False
|
| 335 |
+
)
|
| 336 |
+
|
| 337 |
+
# Botón principal de procesamiento
|
| 338 |
+
process_btn = gr.Button("🚀 Extraer y Convertir", variant="primary", size="lg")
|
| 339 |
+
|
| 340 |
+
# Área de resultados
|
| 341 |
+
with gr.Row():
|
| 342 |
+
with gr.Column(scale=2):
|
| 343 |
+
result_output = gr.Textbox(
|
| 344 |
+
label="📊 Resultado del procesamiento",
|
| 345 |
+
lines=8,
|
| 346 |
+
interactive=False
|
| 347 |
+
)
|
| 348 |
+
|
| 349 |
+
with gr.Column(scale=1):
|
| 350 |
+
file_output = gr.File(
|
| 351 |
+
label="📁 Archivo generado",
|
| 352 |
+
interactive=False
|
| 353 |
+
)
|
| 354 |
+
|
| 355 |
+
# Información adicional
|
| 356 |
+
gr.HTML("""
|
| 357 |
+
<div class="footer">
|
| 358 |
+
<h3>ℹ️ Información de uso</h3>
|
| 359 |
+
<ul style="text-align: left; max-width: 600px; margin: 0 auto;">
|
| 360 |
+
<li><strong>URLs flexibles:</strong> Funciona con cualquier formato (HTTP, HTTPS, con/sin www)</li>
|
| 361 |
+
<li><strong>Detección automática:</strong> Identifica si el contenido es una imagen o texto</li>
|
| 362 |
+
<li><strong>Optimizado para Copilot:</strong> Los archivos generados están formateados para Microsoft Copilot</li>
|
| 363 |
+
<li><strong>Formatos soportados:</strong> PDF (con formato visual) y TXT (texto plano)</li>
|
| 364 |
+
</ul>
|
| 365 |
+
<p style="margin-top: 1rem; color: #64748b;">
|
| 366 |
+
Desarrollado con ❤️ para maximizar la compatibilidad con herramientas de IA
|
| 367 |
+
</p>
|
| 368 |
+
</div>
|
| 369 |
+
""")
|
| 370 |
+
|
| 371 |
+
# Configurar eventos
|
| 372 |
+
validate_btn.click(
|
| 373 |
+
fn=lambda url: validate_url(url)[1],
|
| 374 |
+
inputs=[url_input],
|
| 375 |
+
outputs=[validation_output]
|
| 376 |
+
)
|
| 377 |
+
|
| 378 |
+
process_btn.click(
|
| 379 |
+
fn=process_url,
|
| 380 |
+
inputs=[url_input, format_choice],
|
| 381 |
+
outputs=[result_output, file_output]
|
| 382 |
+
)
|
| 383 |
+
|
| 384 |
+
# Validación automática al cambiar la URL
|
| 385 |
+
url_input.change(
|
| 386 |
+
fn=lambda url: validate_url(url)[1] if url else "",
|
| 387 |
+
inputs=[url_input],
|
| 388 |
+
outputs=[validation_output]
|
| 389 |
+
)
|
| 390 |
+
|
| 391 |
+
# Configuración para Hugging Face Spaces
|
| 392 |
+
if __name__ == "__main__":
|
| 393 |
+
demo.launch(
|
| 394 |
+
server_name="0.0.0.0",
|
| 395 |
+
server_port=7860,
|
| 396 |
+
show_error=True,
|
| 397 |
+
share=False
|
| 398 |
+
)
|
requirements.txt
CHANGED
|
@@ -1,7 +1,29 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
| 2 |
requests==2.31.0
|
| 3 |
-
beautifulsoup4==4.12.
|
| 4 |
-
weasyprint==60.2
|
| 5 |
-
Pillow==10.0.0
|
| 6 |
lxml==4.9.3
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Gradio framework - versión estable compatible con HF Spaces
|
| 2 |
+
gradio==4.44.1
|
| 3 |
+
|
| 4 |
+
# Web scraping y parsing HTML
|
| 5 |
requests==2.31.0
|
| 6 |
+
beautifulsoup4==4.12.3
|
|
|
|
|
|
|
| 7 |
lxml==4.9.3
|
| 8 |
+
|
| 9 |
+
# Conversión HTML a PDF - versión estable
|
| 10 |
+
weasyprint==60.2
|
| 11 |
+
|
| 12 |
+
# Manejo de imágenes
|
| 13 |
+
Pillow==10.0.1
|
| 14 |
+
|
| 15 |
+
# Dependencias específicas para WeasyPrint
|
| 16 |
+
cffi==1.16.0
|
| 17 |
+
pycparser==2.21
|
| 18 |
+
cssselect2==0.7.0
|
| 19 |
+
tinycss2==1.2.1
|
| 20 |
+
webencodings==0.5.1
|
| 21 |
+
|
| 22 |
+
# Dependencias de red compatibles
|
| 23 |
+
urllib3==2.0.7
|
| 24 |
+
certifi==2023.7.22
|
| 25 |
+
charset-normalizer==3.3.2
|
| 26 |
+
idna==3.4
|
| 27 |
+
|
| 28 |
+
# Utilidades adicionales
|
| 29 |
+
python-dateutil==2.8.2
|
test_app.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
|
| 3 |
+
def test_function(text):
|
| 4 |
+
"""Función de prueba simple"""
|
| 5 |
+
return f"✅ Funciona! Recibido: {text}"
|
| 6 |
+
|
| 7 |
+
# Build the smoke-test interface
with gr.Blocks(title="Test App") as demo:
    gr.HTML("<h1>🧪 Test de Funcionamiento</h1>")

    with gr.Row():
        text_in = gr.Textbox(placeholder="Escribe algo...", label="Texto de prueba")
        text_out = gr.Textbox(interactive=False, label="Resultado")

    # Clicking the button passes the input text through test_function
    run_button = gr.Button(value="Probar", variant="primary")
    run_button.click(test_function, inputs=[text_in], outputs=[text_out])

if __name__ == "__main__":
    demo.launch()
|
web_scraper_tool.py
CHANGED
|
@@ -1,201 +1,416 @@
|
|
|
|
|
| 1 |
import requests
|
| 2 |
from bs4 import BeautifulSoup
|
| 3 |
-
import os
|
| 4 |
from weasyprint import HTML, CSS
|
| 5 |
-
from
|
| 6 |
-
from io import BytesIO
|
| 7 |
import re
|
|
|
|
|
|
|
| 8 |
import random
|
| 9 |
-
import mimetypes
|
| 10 |
-
import json
|
| 11 |
-
import time
|
| 12 |
|
| 13 |
class WebScrapperTool:
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
def __init__(self, output_dir):
|
| 17 |
-
"""Inicializa la herramienta
|
| 18 |
-
|
| 19 |
-
Args:
|
| 20 |
-
output_dir: Directorio donde se guardarán los archivos
|
| 21 |
-
"""
|
| 22 |
self.output_dir = output_dir
|
| 23 |
-
self.session = self._create_session()
|
| 24 |
-
|
| 25 |
-
# Crear directorio de salida si no existe
|
| 26 |
if not os.path.exists(output_dir):
|
| 27 |
os.makedirs(output_dir)
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
session = requests.Session()
|
| 32 |
-
|
| 33 |
-
# Lista de user agents comunes
|
| 34 |
-
user_agents = [
|
| 35 |
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
| 36 |
-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/
|
| 37 |
-
'Mozilla/5.0 (
|
| 38 |
-
'Mozilla/5.0 (
|
| 39 |
-
'Mozilla/5.0 (
|
| 40 |
]
|
| 41 |
|
| 42 |
-
|
| 43 |
-
headers
|
| 44 |
-
|
|
|
|
| 45 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
| 46 |
'Accept-Language': 'es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3',
|
| 47 |
-
'
|
| 48 |
-
'DNT': '1',
|
|
|
|
|
|
|
| 49 |
}
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
| 53 |
|
| 54 |
-
|
| 55 |
-
"""Verifica si una URL es una imagen basándose en la extensión y/o Content-Type
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
return True
|
| 67 |
|
| 68 |
-
# Verificar por
|
| 69 |
try:
|
| 70 |
-
response =
|
| 71 |
-
content_type = response.headers.get('
|
| 72 |
-
|
|
|
|
| 73 |
except:
|
| 74 |
-
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
-
|
| 78 |
-
|
|
|
|
| 79 |
|
| 80 |
-
|
| 81 |
-
url: URL de la imagen
|
| 82 |
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
"""
|
| 86 |
try:
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
response.raise_for_status()
|
|
|
|
| 90 |
|
| 91 |
-
#
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
-
#
|
| 99 |
try:
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
except Exception as e:
|
| 109 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
-
def scrape_to_text(self, url,
|
| 112 |
-
"""
|
|
|
|
|
|
|
| 113 |
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
"""
|
| 121 |
-
try:
|
| 122 |
-
# Obtener contenido de la página
|
| 123 |
-
response = self.session.get(url, timeout=15)
|
| 124 |
response.raise_for_status()
|
|
|
|
| 125 |
|
| 126 |
-
#
|
| 127 |
soup = BeautifulSoup(response.text, 'html.parser')
|
| 128 |
|
| 129 |
-
#
|
| 130 |
-
for element in soup(['script', 'style', '
|
| 131 |
-
element.
|
| 132 |
|
| 133 |
-
#
|
| 134 |
-
|
| 135 |
|
| 136 |
-
# Limpiar
|
| 137 |
-
lines = [line.strip() for line in
|
| 138 |
-
|
| 139 |
|
| 140 |
-
#
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
| 144 |
|
| 145 |
-
|
| 146 |
-
with open(output_path, 'w', encoding='utf-8') as f:
|
| 147 |
-
f.write(f"URL: {url}\n\n")
|
| 148 |
-
f.write(text)
|
| 149 |
|
| 150 |
-
|
| 151 |
-
except Exception as e:
|
| 152 |
-
raise Exception(f"Error al hacer scraping a texto: {str(e)}")
|
| 153 |
|
| 154 |
-
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
output_path: Ruta donde guardar el archivo PDF
|
| 160 |
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
try:
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
}
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
height: auto;
|
| 190 |
-
}
|
| 191 |
"""
|
| 192 |
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
|
| 199 |
-
return output_path
|
| 200 |
except Exception as e:
|
| 201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
import requests
|
| 3 |
from bs4 import BeautifulSoup
|
|
|
|
| 4 |
from weasyprint import HTML, CSS
|
| 5 |
+
from urllib.parse import urlparse, urlunparse
|
|
|
|
| 6 |
import re
|
| 7 |
+
from PIL import Image
|
| 8 |
+
import io
|
| 9 |
import random
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
class WebScrapperTool:
|
| 12 |
+
def __init__(self, output_dir="output"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
self.output_dir = output_dir
|
|
|
|
|
|
|
|
|
|
| 14 |
if not os.path.exists(output_dir):
|
| 15 |
os.makedirs(output_dir)
|
| 16 |
|
| 17 |
+
# Múltiples user agents para evitar bloqueos
|
| 18 |
+
self.user_agents = [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
| 20 |
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
| 21 |
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
| 22 |
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
|
| 23 |
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0'
|
| 24 |
]
|
| 25 |
|
| 26 |
+
def get_headers(self):
    """Return request headers with a user agent picked at random from self.user_agents."""
    headers = {'User-Agent': random.choice(self.user_agents)}
    # The remaining headers are fixed for every request.
    headers.update({
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    })
    return headers
|
| 37 |
|
| 38 |
+
def normalize_url(self, url):
    """Normalize a user-supplied URL.

    Surrounding whitespace is removed, the scheme and host are lowercased,
    and "https://" is prepended when the input does not start with
    http:// or https:// (in any letter case). Path, params, query and
    fragment are kept as given, as is any "user:password@" prefix.

    Args:
        url: URL text as typed by the user.

    Returns:
        The normalized URL string.

    Raises:
        ValueError: If the URL is empty or whitespace-only, cannot be
            parsed, or has no host.
    """
    # Strip before the emptiness check so "   " is rejected instead of
    # silently becoming "https://".
    url = url.strip() if url else ''
    if not url:
        raise ValueError("URL no puede estar vacía")

    # Scheme detection is case-insensitive; everything after it is kept as typed.
    lowered = url.lower()
    if lowered.startswith('http://'):
        url = 'http://' + url[7:]
    elif lowered.startswith('https://'):
        url = 'https://' + url[8:]
    else:
        # No recognised scheme: default to https
        url = 'https://' + url

    try:
        parsed = urlparse(url)
    except ValueError as e:
        raise ValueError(f"URL inválida: {url}. Error: {str(e)}") from e

    # Lowercase only the host[:port] part; credentials are case-sensitive.
    userinfo, separator, host_port = parsed.netloc.rpartition('@')
    if not host_port:
        raise ValueError(f"URL inválida: {url}. Error: falta el host")
    netloc = userinfo + separator + host_port.lower()

    return urlunparse((
        parsed.scheme.lower(),
        netloc,
        parsed.path,
        parsed.params,
        parsed.query,
        parsed.fragment,
    ))
|
| 76 |
+
|
| 77 |
+
def is_image_url(self, url):
    """Return True when the URL appears to point at an image.

    The path extension is checked first. If it is not a known image
    extension, a HEAD request is sent and the Content-Type header is
    inspected. Any failure of that request is treated as "not an image".

    Args:
        url: URL to classify.

    Returns:
        True if the URL looks like an image, otherwise False.
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp', '.tiff', '.ico')

    # Extension check (case-insensitive, ignores query string and fragment)
    path = urlparse(url).path.lower()
    if path.endswith(image_extensions):
        return True

    # Best-effort Content-Type probe. `except Exception` keeps the original
    # "never fail here" behaviour but, unlike a bare except, lets
    # KeyboardInterrupt and SystemExit propagate.
    try:
        response = requests.head(url, headers=self.get_headers(), timeout=10)
        content_type = response.headers.get('content-type', '').lower()
    except Exception:
        return False

    return content_type.startswith('image/')
|
| 97 |
+
|
| 98 |
+
def get_clean_html_for_pdf(self, html_content, base_url):
    """Strip a page down to markup that is safe to render as PDF.

    Scripts, styles, embedded objects and forms are removed, every tag is
    reduced to a small set of attributes, the content is wrapped in a full
    HTML skeleton when no <html> element is present, and a basic print
    stylesheet is added to <head>.

    Args:
        html_content: Raw HTML of the page.
        base_url: Accepted for interface compatibility; not used by this method.

    Returns:
        The cleaned document serialized as a string.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Drop elements that are problematic for PDF rendering
    for element in soup(['script', 'style', 'noscript', 'iframe', 'embed', 'object', 'form']):
        element.decompose()

    # Keep only a whitelist of attributes on every tag
    safe_attrs = frozenset(['href', 'src', 'alt', 'title', 'class', 'id'])
    for tag in soup.find_all():
        # Materialize the names first: tag.attrs must not change while iterated
        for attr in [name for name in tag.attrs if name not in safe_attrs]:
            del tag[attr]

    # Wrap fragments in a complete HTML structure
    if soup.html is None:
        new_soup = BeautifulSoup('<!DOCTYPE html><html><head></head><body></body></html>', 'html.parser')
        new_soup.body.extend(soup.contents[:])
        soup = new_soup

    # Basic stylesheet for better PDF rendering
    css_style = soup.new_tag('style')
    css_style.string = """
        body {
            font-family: Arial, sans-serif;
            line-height: 1.6;
            margin: 20px;
            color: #333;
            max-width: 800px;
        }
        h1, h2, h3, h4, h5, h6 {
            color: #2c3e50;
            margin-top: 20px;
            page-break-after: avoid;
        }
        p {
            margin-bottom: 10px;
            text-align: justify;
        }
        a {
            color: #3498db;
            text-decoration: none;
        }
        img {
            max-width: 100%;
            height: auto;
            page-break-inside: avoid;
        }
        table {
            border-collapse: collapse;
            width: 100%;
            page-break-inside: avoid;
        }
        th, td {
            border: 1px solid #ddd;
            padding: 8px;
            text-align: left;
        }
        @page {
            margin: 2cm;
            @bottom-center {
                content: "Página " counter(page);
            }
        }
    """

    # Pages with <html> but no <head> previously lost the stylesheet;
    # create the missing <head> so the CSS is always attached.
    if soup.head is None:
        soup.html.insert(0, soup.new_tag('head'))
    soup.head.append(css_style)

    return str(soup)
|
|
|
|
| 171 |
|
| 172 |
+
def scrape_to_pdf(self, url, filename=None):
    """Download a web page and render it to a PDF file in self.output_dir.

    Image URLs are delegated to _handle_image_to_pdf. If WeasyPrint fails to
    render the cleaned page, a simplified text-only document is rendered
    instead.

    Args:
        url: Address of the page; it is normalized with normalize_url first.
        filename: Optional output file name. ".pdf" is appended when missing.
            Defaults to "scraped_<domain>.pdf".

    Returns:
        A dict with 'status' ('success' or 'error'), 'message' and 'url';
        on success it also contains 'file', the path of the generated PDF.
        Errors are reported through the dict rather than raised.
    """
    from html import escape  # local import: only the fallback document needs it

    try:
        normalized_url = self.normalize_url(url)

        # Images get their own PDF layout
        if self.is_image_url(normalized_url):
            return self._handle_image_to_pdf(normalized_url, filename)

        # Fetch the page
        response = requests.get(normalized_url, headers=self.get_headers(), timeout=30)
        response.raise_for_status()
        response.encoding = response.apparent_encoding or 'utf-8'

        # Clean the HTML for PDF rendering
        clean_html = self.get_clean_html_for_pdf(response.text, normalized_url)

        # Derive a file name from the domain when none was given
        if not filename:
            domain = urlparse(normalized_url).netloc.replace('www.', '')
            domain_clean = re.sub(r'[^a-zA-Z0-9_-]', '_', domain)
            filename = f"scraped_{domain_clean}.pdf"

        if not filename.endswith('.pdf'):
            filename += '.pdf'

        pdf_path = os.path.join(self.output_dir, filename)

        try:
            HTML(string=clean_html, base_url=normalized_url).write_pdf(pdf_path)
        except Exception:
            # Fallback: render only the page text inside a minimal document.
            # Text and URL are escaped so characters such as "<" or "&" in the
            # scraped content cannot break or inject markup.
            page_text = BeautifulSoup(response.text, 'html.parser').get_text()
            simple_html = f"""
            <!DOCTYPE html>
            <html>
            <head>
                <meta charset="utf-8">
                <title>Web Scraping Result</title>
                <style>
                    body {{ font-family: Arial, sans-serif; margin: 20px; line-height: 1.6; }}
                    .header {{ background-color: #f8f9fa; padding: 10px; margin-bottom: 20px; }}
                    .content {{ max-width: 800px; }}
                </style>
            </head>
            <body>
                <div class="header">
                    <h1>Contenido Web Extraído</h1>
                    <p><strong>URL:</strong> {escape(normalized_url)}</p>
                </div>
                <div class="content">
                    {escape(page_text)}
                </div>
            </body>
            </html>
            """
            HTML(string=simple_html).write_pdf(pdf_path)

        return {
            'status': 'success',
            'file': pdf_path,
            'url': normalized_url,
            'message': f'PDF generado exitosamente: {filename}'
        }

    except requests.RequestException as e:
        return {
            'status': 'error',
            'message': f'Error al acceder a la URL: {str(e)}',
            'url': url
        }
    except Exception as e:
        return {
            'status': 'error',
            'message': f'Error al generar PDF: {str(e)}',
            'url': url
        }
|
| 251 |
|
| 252 |
+
def scrape_to_text(self, url, filename=None):
    """Download a web page and save its visible text to a TXT file.

    Image URLs are delegated to _handle_image_to_text. The output file starts
    with a metadata header (URL, extraction date, character count, content
    type) followed by the extracted text.

    Args:
        url: Address of the page; it is normalized with normalize_url first.
        filename: Optional output file name. ".txt" is appended when missing.
            Defaults to "scraped_<domain>.txt".

    Returns:
        A dict with 'status' ('success' or 'error'), 'message' and 'url';
        on success it also contains 'file', the path of the written file.
        Errors are reported through the dict rather than raised.
    """
    from datetime import datetime

    try:
        normalized_url = self.normalize_url(url)

        # Images only produce a metadata file
        if self.is_image_url(normalized_url):
            return self._handle_image_to_text(normalized_url, filename)

        # Fetch the page
        response = requests.get(normalized_url, headers=self.get_headers(), timeout=30)
        response.raise_for_status()
        response.encoding = response.apparent_encoding or 'utf-8'

        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove non-content elements
        for element in soup(['script', 'style', 'noscript', 'header', 'footer', 'nav', 'aside']):
            element.decompose()

        # One text node per line, blank lines dropped
        text_content = soup.get_text(separator='\n', strip=True)
        lines = [line.strip() for line in text_content.split('\n') if line.strip()]
        clean_text = '\n'.join(lines)

        # Metadata header. Image URLs returned above, so the content type is
        # always text here; is_image_url is not called again because that
        # would issue a second HEAD request. The header is assembled line by
        # line so the file content does not depend on source indentation.
        metadata = '\n'.join([
            "CONTENIDO WEB EXTRAÍDO",
            f"URL: {normalized_url}",
            f"Fecha de extracción: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"Caracteres extraídos: {len(clean_text)}",
            "Tipo de contenido: Texto",
            "",
            '=' * 50,
            "",
            clean_text,
        ])

        # Derive a file name from the domain when none was given
        if not filename:
            domain = urlparse(normalized_url).netloc.replace('www.', '')
            domain_clean = re.sub(r'[^a-zA-Z0-9_-]', '_', domain)
            filename = f"scraped_{domain_clean}.txt"

        if not filename.endswith('.txt'):
            filename += '.txt'

        txt_path = os.path.join(self.output_dir, filename)

        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(metadata)

        return {
            'status': 'success',
            'file': txt_path,
            'url': normalized_url,
            'message': f'Texto extraído exitosamente: {filename}'
        }

    except Exception as e:
        return {
            'status': 'error',
            'message': f'Error al extraer texto: {str(e)}',
            'url': url
        }
|
| 319 |
+
|
| 320 |
+
def _handle_image_to_pdf(self, url, filename):
    """Render a PDF page that shows the image at `url` with a short header.

    Args:
        url: Address of the image.
        filename: Output file name, or a falsy value to use
            "image_scraped.pdf". ".pdf" is appended when missing.

    Returns:
        A dict with 'status' ('success' or 'error'), 'message' and 'url';
        on success it also contains 'file', the path of the generated PDF.
    """
    from html import escape  # local import: only this HTML template needs it

    try:
        # Fail early with an HTTP error if the image cannot be downloaded;
        # the document below references the image by URL.
        response = requests.get(url, headers=self.get_headers(), timeout=30)
        response.raise_for_status()

        # Escaped so quotes, "<" or "&" in the URL cannot break the markup
        # or terminate the src attribute.
        safe_url = escape(url, quote=True)

        html_content = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <meta charset="utf-8">
            <style>
                body {{ margin: 0; padding: 20px; text-align: center; font-family: Arial, sans-serif; }}
                img {{ max-width: 100%; height: auto; }}
                .info {{ margin-top: 20px; }}
            </style>
        </head>
        <body>
            <div class="info">
                <h1>Imagen Extraída</h1>
                <p><strong>URL:</strong> {safe_url}</p>
                <p><strong>Tipo:</strong> Imagen</p>
            </div>
            <img src="{safe_url}" alt="Imagen extraída">
        </body>
        </html>
        """

        if not filename:
            filename = "image_scraped.pdf"

        # Same extension rule as scrape_to_pdf
        if not filename.endswith('.pdf'):
            filename += '.pdf'

        pdf_path = os.path.join(self.output_dir, filename)
        HTML(string=html_content).write_pdf(pdf_path)

        return {
            'status': 'success',
            'file': pdf_path,
            'url': url,
            'message': f'Imagen convertida a PDF: {filename}'
        }

    except Exception as e:
        return {
            'status': 'error',
            'message': f'Error al procesar imagen: {str(e)}',
            'url': url
        }
|
| 368 |
+
|
| 369 |
+
def _handle_image_to_text(self, url, filename):
    """Write a text file describing the image at `url`.

    The image is downloaded and, when PIL can open it, its format,
    dimensions and mode are recorded; otherwise only the URL and byte size
    are written.

    Args:
        url: Address of the image.
        filename: Output file name, or a falsy value to use
            "image_info.txt". ".txt" is appended when missing.

    Returns:
        A dict with 'status' ('success' or 'error'), 'message' and 'url';
        on success it also contains 'file', the path of the written file.
    """
    try:
        response = requests.get(url, headers=self.get_headers(), timeout=30)
        response.raise_for_status()
        size_line = f"Tamaño del archivo: {len(response.content)} bytes"

        # Both reports are assembled line by line so the file content does
        # not depend on source indentation.
        try:
            # Context manager releases the decoder's resources after reading
            with Image.open(io.BytesIO(response.content)) as img:
                report_lines = [
                    "IMAGEN DETECTADA",
                    f"URL: {url}",
                    f"Formato: {img.format}",
                    f"Dimensiones: {img.size[0]}x{img.size[1]} píxeles",
                    f"Modo: {img.mode}",
                    size_line,
                    "",
                    "Esta URL contiene una imagen, no texto extraíble.",
                    "Para procesar el contenido visual, considera usar herramientas de OCR.",
                    "",
                ]
        except Exception:
            # PIL could not read the data: fall back to a reduced report.
            # (Was a bare except, which also swallowed KeyboardInterrupt.)
            report_lines = [
                "IMAGEN DETECTADA",
                f"URL: {url}",
                size_line,
                "",
                "Esta URL contiene una imagen, no texto extraíble.",
                "",
            ]
        img_info = '\n'.join(report_lines)

        if not filename:
            filename = "image_info.txt"

        # Same extension rule as scrape_to_text
        if not filename.endswith('.txt'):
            filename += '.txt'

        txt_path = os.path.join(self.output_dir, filename)

        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(img_info)

        return {
            'status': 'success',
            'file': txt_path,
            'url': url,
            'message': f'Información de imagen guardada: {filename}'
        }

    except Exception as e:
        return {
            'status': 'error',
            'message': f'Error al procesar imagen: {str(e)}',
            'url': url
        }
|