Update app.py
Browse files
app.py
CHANGED
|
@@ -13,6 +13,7 @@ import os
|
|
| 13 |
import json
|
| 14 |
import gradio as gr
|
| 15 |
from playwright.async_api import async_playwright
|
|
|
|
| 16 |
|
| 17 |
USER_AGENT = (
|
| 18 |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
|
@@ -20,6 +21,14 @@ USER_AGENT = (
|
|
| 20 |
"Chrome/91.0.4472.124 Safari/537.36"
|
| 21 |
)
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
async def scrape_images(url, max_imgs):
|
| 24 |
max_imgs = max(10, min(max_imgs, 300))
|
| 25 |
async with async_playwright() as p:
|
|
@@ -29,7 +38,6 @@ async def scrape_images(url, max_imgs):
|
|
| 29 |
viewport={"width": 1366, "height": 768},
|
| 30 |
)
|
| 31 |
|
| 32 |
-
# Cargar cookies desde variable de entorno si está definida
|
| 33 |
cookies_env = os.getenv("COOKIES_JSON")
|
| 34 |
if cookies_env:
|
| 35 |
try:
|
|
@@ -79,24 +87,28 @@ async def scrape_images(url, max_imgs):
|
|
| 79 |
await browser.close()
|
| 80 |
return collected_data[:max_imgs]
|
| 81 |
|
| 82 |
-
def run_scraper(
|
|
|
|
|
|
|
| 83 |
return asyncio.run(scrape_images(url, int(max_imgs)))
|
| 84 |
|
| 85 |
-
def interface_fn(
|
| 86 |
-
results = run_scraper(
|
| 87 |
images = [(item["img_url"], f"Usuario: {item['user']}") for item in results]
|
| 88 |
return images
|
| 89 |
|
| 90 |
demo = gr.Interface(
|
| 91 |
fn=interface_fn,
|
| 92 |
inputs=[
|
| 93 |
-
gr.Textbox(label="URL
|
|
|
|
| 94 |
gr.Slider(minimum=10, maximum=300, step=1, value=30, label="Máximo de imágenes")
|
| 95 |
],
|
| 96 |
outputs=gr.Gallery(label="Imágenes recolectadas"),
|
| 97 |
title="Scraper de Imágenes - DeviantArt",
|
| 98 |
-
description="Introduce
|
| 99 |
)
|
| 100 |
|
| 101 |
if __name__ == "__main__":
|
| 102 |
demo.launch()
|
|
|
|
|
|
| 13 |
import json
|
| 14 |
import gradio as gr
|
| 15 |
from playwright.async_api import async_playwright
|
| 16 |
+
from urllib.parse import quote_plus
|
| 17 |
|
| 18 |
USER_AGENT = (
|
| 19 |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
|
|
|
| 21 |
"Chrome/91.0.4472.124 Safari/537.36"
|
| 22 |
)
|
| 23 |
|
def build_url(input_str):
    """Turn free-form user input into a usable DeviantArt URL.

    A string that already starts with an http(s) scheme is passed through
    untouched; anything else is treated as a search term (category or user
    name) and URL-encoded into a DeviantArt search query.
    """
    input_str = input_str.strip()
    # Guard clause: the caller already pasted a full URL — nothing to build.
    if input_str.startswith(("http://", "https://")):
        return input_str
    return "https://www.deviantart.com/search?q=" + quote_plus(input_str)
|
| 31 |
+
|
| 32 |
async def scrape_images(url, max_imgs):
|
| 33 |
max_imgs = max(10, min(max_imgs, 300))
|
| 34 |
async with async_playwright() as p:
|
|
|
|
| 38 |
viewport={"width": 1366, "height": 768},
|
| 39 |
)
|
| 40 |
|
|
|
|
| 41 |
cookies_env = os.getenv("COOKIES_JSON")
|
| 42 |
if cookies_env:
|
| 43 |
try:
|
|
|
|
| 87 |
await browser.close()
|
| 88 |
return collected_data[:max_imgs]
|
| 89 |
|
def run_scraper(user_input, max_imgs):
    """Resolve *user_input* into a URL and run the async scraper to completion.

    Synchronous bridge used by the Gradio callback: builds the target URL,
    logs it, then drives the async `scrape_images` coroutine with
    `asyncio.run` and returns its collected results.
    """
    target = build_url(user_input)
    print(f"Usando URL: {target}")
    return asyncio.run(scrape_images(target, int(max_imgs)))
|
| 94 |
|
def interface_fn(user_input, max_imgs):
    """Gradio callback: scrape images and shape them for a Gallery output.

    Returns a list of (image_url, caption) pairs, one per scraped item,
    where the caption names the DeviantArt user who posted the image.
    """
    scraped = run_scraper(user_input, max_imgs)
    gallery = []
    for entry in scraped:
        gallery.append((entry["img_url"], f"Usuario: {entry['user']}"))
    return gallery
|
| 99 |
|
# Gradio UI wiring: a textbox that accepts either a full URL or a bare
# category/user name, a slider bounding how many images to collect, and a
# gallery that displays the scraped results.
demo = gr.Interface(
    fn=interface_fn,
    inputs=[
        gr.Textbox(
            label="URL o Categoría DeviantArt",
            lines=1,
            placeholder="Pega una URL o escribe una categoría o usuario",
        ),
        gr.Slider(
            minimum=10,
            maximum=300,
            step=1,
            value=30,
            label="Máximo de imágenes",
        ),
    ],
    outputs=gr.Gallery(label="Imágenes recolectadas"),
    title="Scraper de Imágenes - DeviantArt",
    description="Introduce una URL completa o solo una categoría/usuario para buscar imágenes.",
)

if __name__ == "__main__":
    demo.launch()
|
| 114 |
+
|