Spaces:
Sleeping
Sleeping
File size: 2,078 Bytes
613861f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
# ----------------------
# app.py
# ----------------------
import time
import gradio as gr
import requests
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
def dynamic_scrape(url: str, wait_ms: int = 3000) -> str:
    """
    Render `url` in headless Chromium via Playwright and return its HTML.

    The page is opened, given `wait_ms` milliseconds for client-side
    JavaScript to run, and then serialized with `page.content()`.

    Args:
        url: Address to navigate to.
        wait_ms: Milliseconds to pause after navigation before reading the
            page. Defaults to 3000, the delay this function always used.

    Returns:
        The rendered HTML on success. On any failure the exception is not
        raised; a string of the form "Error: <exception text>" is returned
        instead, so callers must check for that prefix.
    """
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(url)
                # Fixed pause (not a selector wait) so scripts get a chance
                # to populate the DOM before it is read.
                page.wait_for_timeout(wait_ms)
                return page.content()
            finally:
                # Runs on both success and failure, so a navigation error
                # does not skip the browser shutdown.
                browser.close()
    except Exception as e:
        # Deliberate best-effort boundary: failures are reported as text.
        return f"Error: {e}"
def scrape_and_parse(url):
    """
    Scrape the rendered page at `url` and return the text of its <p> tags.

    Args:
        url: Address passed through to `dynamic_scrape`.

    Returns:
        The text of every <p> element joined by blank lines and stripped of
        surrounding whitespace; a fixed notice when the page has no <p>
        elements; or the "Error: ..." string from `dynamic_scrape` when the
        scrape itself failed.
    """
    html = dynamic_scrape(url)
    # dynamic_scrape reports failures as an "Error: ..." string rather than
    # raising. Parsing that string as HTML would hide the real cause behind
    # the "No <p> tags found" notice, so pass it through unchanged.
    if html.startswith("Error: "):
        return html

    soup = BeautifulSoup(html, "html.parser")
    # Grab all <p> elements as an example of post-processing the markup.
    paragraphs = soup.find_all("p")
    if not paragraphs:
        return "No <p> tags found, or site is heavily JavaScript-based."
    return "\n\n".join(p.get_text() for p in paragraphs).strip()
def on_scrape(url):
    """
    Gradio handler: validate the submitted URL, then scrape and parse it.

    Args:
        url: Text from the URL input box. Surrounding whitespace is ignored;
            `None` or an empty value is treated as invalid.

    Returns:
        A validation message when the input does not begin with an
        `http://` or `https://` scheme (matched case-insensitively),
        otherwise whatever `scrape_and_parse` returns for the cleaned URL.
    """
    cleaned = (url or "").strip()
    # Require the full scheme prefix: a bare startswith("http") would let
    # non-URLs such as "httpfoo" through to the browser.
    if not cleaned.lower().startswith(("http://", "https://")):
        return "Please enter a valid URL starting with http or https."
    return scrape_and_parse(cleaned)
# Single-page UI: a URL field, a results box, and a button that runs the scrape.
with gr.Blocks(title="Playwright Scraper") as demo:
    gr.Markdown(
        "## JavaScript-Aware Web Scraper\n"
        "Enter a URL to scrape dynamic, JavaScript-rendered content using Playwright."
    )
    url_input = gr.Textbox(value="https://example.com", label="URL")
    output_box = gr.Textbox(lines=10, label="Scraped Content")
    scrape_button = gr.Button("Scrape")
    # Clicking sends the URL text to on_scrape and displays its return value.
    scrape_button.click(on_scrape, inputs=url_input, outputs=output_box)

# Serve on port 7860, bound to 0.0.0.0.
demo.launch(server_port=7860, server_name="0.0.0.0")
|