Update app.py
app.py
CHANGED
@@ -1,120 +1,120 @@
 import requests
-from flask import Flask, render_template, request, flash, redirect, url_for
 from bs4 import BeautifulSoup
-import pandas as pd
 import time

 """
 """

 try:
     response = requests.get(url, headers=headers, timeout=15)
-    response.raise_for_status()
-    # --- Parse the HTML with BeautifulSoup ---
     soup = BeautifulSoup(response.content, 'html.parser')

-# --- Main Entry Point ---
-if __name__ == '__main__':
-    # Start the Flask development server
-    # debug=True shows errors in the browser and reloads the server automatically
-    app.run(debug=True)
+import gradio as gr
 import requests
 from bs4 import BeautifulSoup
 import time
+import random
+from urllib.parse import urljoin
+
+# --- Anti-Scraping Techniques ---
+USER_AGENTS = [
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
+]
+
+def get_random_user_agent():
+    """Return a random User-Agent."""
+    return random.choice(USER_AGENTS)
+
+# --- Main Scraping Function ---
+def scrape_website(url, max_links_str):
     """
+    Scrape the URL, visit each detail link, extract the contents of the tables,
+    and return a text file for download.
     """
+    if not url.startswith('http'):
+        url = 'https://' + url

+    # Convert the maximum number of links to an integer, with a default value
+    try:
+        max_links = int(max_links_str)
+    except (ValueError, TypeError):
+        max_links = 10  # Default value if the input is not valid
+
+    links_to_visit = set()
+    all_content = f"Resultados del scraping para: {url}\n"
+    all_content += "========================================\n\n"

     try:
+        # 1. Request the main URL
+        headers = {'User-Agent': get_random_user_agent()}
         response = requests.get(url, headers=headers, timeout=15)
+        response.raise_for_status()
         soup = BeautifulSoup(response.content, 'html.parser')

+        # 2. Find all links that look like hearing ("audiencias") pages
+        # A specific URL pattern is used to be more precise
+        for a_tag in soup.find_all('a', href=True):
+            link = a_tag['href']
+            # Use urljoin to build the absolute URL correctly
+            full_link = urljoin(url, link)
+            # Keep only the hearing links that belong to the same site
+            if url in full_link and '/audiencias/' in full_link:
+                links_to_visit.add(full_link)
+
+        all_content += f"Se encontraron {len(links_to_visit)} links de audiencias para visitar.\n"
+        all_content += f"Procesando los primeros {min(len(links_to_visit), max_links)} links...\n\n"
+
+        # 3. Visit each link and extract the contents of its table
+        for i, link in enumerate(list(links_to_visit)[:max_links]):
+            try:
+                time.sleep(random.uniform(1, 2.5))  # Polite pause between requests
+                headers = {'User-Agent': get_random_user_agent()}
+                detail_response = requests.get(link, headers=headers, timeout=10)
+                detail_response.raise_for_status()
+                detail_soup = BeautifulSoup(detail_response.content, 'html.parser')
+
+                title = detail_soup.find('title').get_text(strip=True) if detail_soup.find('title') else "Sin título"
+                all_content += f"--- Contenido de: {link} ---\n"
+                all_content += f"Título: {title}\n\n"
+
+                # Find the details table (inspecting the page shows it uses the class 'table')
+                table = detail_soup.find('table', class_='table')
+                if table:
+                    # Extract every row of the table
+                    rows = table.find_all('tr')
+                    for row in rows:
+                        # Extract the header (th) and data (td) cells
+                        cols = row.find_all(['th', 'td'])
+                        # Clean and join the text of the cells
+                        cleaned_cols = [ele.text.strip() for ele in cols]
+                        all_content += " | ".join(cleaned_cols) + "\n"
+                else:
+                    all_content += "No se encontró una tabla de detalles en esta página.\n"
+
+                all_content += "\n----------------------------------------\n\n"
+
+            except requests.RequestException as e:
+                all_content += f"Error al visitar {link}: {e}\n\n"
+
+    except requests.RequestException as e:
+        return f"Error al acceder a la URL principal: {e}", None  # Return two values to match the two outputs
+
+    # 4. Create the text file and return it
+    # Gradio serves the returned file path for download automatically
+    file_path = "resultados_scraping.txt"
+    with open(file_path, "w", encoding="utf-8") as f:
+        f.write(all_content)
+
+    # Return a success message and the file path for the download
+    return f"¡Proceso completado! Se procesaron {min(len(links_to_visit), max_links)} links. Descarga el archivo para ver los resultados.", file_path
+
+
+# --- Gradio Interface ---
+iface = gr.Interface(
+    fn=scrape_website,
+    inputs=[
+        gr.Textbox(lines=1, placeholder="Ingresa una URL (ej. leylobby.gob.cl/...)"),
+        gr.Textbox(value="10", label="Número máximo de links a visitar")
+    ],
+    outputs=[
+        gr.Textbox(label="Estado del Proceso"),
+        gr.File(label="Descargar Resultados (.txt)")  # File download component
+    ],
+    title="🤖 Web Scraper Pro v2",
+    description="Ingresa una URL para extraer el contenido de los links de detalle. El resultado se genera en un archivo .txt descargable. ¡Ideal para análisis de datos!",
+    allow_flagging="never"
+)
+
+# Launch the app!
+iface.launch()
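
A note on dependencies: the new version imports only gradio, requests, and bs4 beyond the standard library, so a minimal requirements.txt for the Space could contain just the three packages below. This is a sketch of what such a file might look like, not the repo's actual file.

gradio
requests
beautifulsoup4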
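
For a quick sanity check outside the Gradio UI, scrape_website can also be called directly, for example from a Python REPL where the function has already been defined. The URL and link budget below are placeholders, not values taken from the Space.

# Hypothetical manual test: placeholder URL, small link budget.
status_msg, result_file = scrape_website("https://example.org/audiencias/", "3")
print(status_msg)
if result_file:
    print(open(result_file, encoding="utf-8").read())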