# Provenance: Hugging Face Space file "app.py" by SmokeyBandit (commit 613861f, verified).
# ----------------------
# app.py
# ----------------------
import time
import gradio as gr
import requests
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
def dynamic_scrape(url):
    """
    Render `url` in a headless Chromium browser via Playwright and return
    the page HTML after client-side JavaScript has had time to run.

    Parameters
    ----------
    url : str
        The page to load (expected to be an http/https URL).

    Returns
    -------
    str
        The rendered HTML, or a string beginning with "Error: " when
        launching, navigating, or rendering fails (callers display it).
    """
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(url)
                # Fixed grace period for JS rendering; waiting on a specific
                # selector would be more precise but needs per-site knowledge.
                page.wait_for_timeout(3000)  # 3 seconds
                return page.content()
            finally:
                # Always release the browser, even if goto/wait raises;
                # otherwise the process leaks until playwright teardown.
                browser.close()
    except Exception as e:
        # Keep the error-as-string contract used by scrape_and_parse.
        return f"Error: {e}"
def scrape_and_parse(url):
    """
    Scrape JavaScript-rendered content from `url` and extract paragraph text.

    Parameters
    ----------
    url : str
        The page to scrape.

    Returns
    -------
    str
        The text of all <p> elements joined by blank lines, an explanatory
        message when none are found, or the "Error: ..." string produced by
        dynamic_scrape on failure.
    """
    html = dynamic_scrape(url)
    # dynamic_scrape signals failure via an "Error: ..." string; don't feed
    # that to the parser — it would just yield the misleading
    # "No <p> tags found" message instead of the real error.
    if html.startswith("Error:"):
        return html
    soup = BeautifulSoup(html, "html.parser")
    # Grab all <p> elements as an example
    paragraphs = soup.find_all("p")
    if not paragraphs:
        return "No <p> tags found, or site is heavily JavaScript-based."
    text_content = "\n\n".join(p.get_text() for p in paragraphs)
    return text_content.strip()
def on_scrape(url):
    """
    Gradio click handler: validate the URL, then scrape and parse it.

    Parameters
    ----------
    url : str
        User-supplied URL from the textbox (may be None or padded).

    Returns
    -------
    str
        Scraped paragraph text, or a validation/error message.
    """
    url = (url or "").strip()
    # Require a real scheme: a bare startswith("http") would accept
    # strings like "httpfoo" or "https" with no "://" separator.
    if not url.startswith(("http://", "https://")):
        return "Please enter a valid URL starting with http or https."
    return scrape_and_parse(url)
# Build the Gradio UI: one URL input, one output box, and a button wired
# to the on_scrape handler.
with gr.Blocks(title="Playwright Scraper") as demo:
    gr.Markdown("## JavaScript-Aware Web Scraper\n"
                "Enter a URL to scrape dynamic, JavaScript-rendered content using Playwright.")
    url_input = gr.Textbox(label="URL", value="https://example.com")
    output_box = gr.Textbox(label="Scraped Content", lines=10)
    scrape_button = gr.Button("Scrape")
    scrape_button.click(fn=on_scrape, inputs=url_input, outputs=output_box)

# Launch only when executed as a script (how Spaces run app.py), so the
# module can be imported — e.g. for tests — without starting a server.
if __name__ == "__main__":
    # 0.0.0.0/7860 is the standard bind for containerized Gradio apps.
    demo.launch(server_name="0.0.0.0", server_port=7860)