# ----------------------
# app.py
# ----------------------
import time
import gradio as gr
import requests
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright


def dynamic_scrape(url):
    """
    Launch a headless browser via Playwright, navigate to `url`, wait for
    JavaScript to load, and return the rendered HTML.

    Returns the page HTML on success, or a string of the form
    "Error: <exception>" on failure (callers treat both as text to parse).
    """
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                # Go to the URL
                page.goto(url)
                # Wait a few seconds (or for a specific element) to ensure JS is loaded
                page.wait_for_timeout(3000)  # 3 seconds
                rendered_html = page.content()
            finally:
                # Always release the browser, even if navigation/rendering raises;
                # previously an exception here leaked the browser process.
                browser.close()
            return rendered_html
    except Exception as e:
        return f"Error: {e}"


def scrape_and_parse(url):
    """
    Scrape dynamic content, then parse with BeautifulSoup for demonstration.

    Returns the text of all <p> elements joined by blank lines, or a
    fallback message when none are found.
    """
    html = dynamic_scrape(url)
    soup = BeautifulSoup(html, "html.parser")

    # Grab all <p> elements as an example
    paragraphs = soup.find_all("p")
    if not paragraphs:
        return "No <p> tags found, or site is heavily JavaScript-based."

    text_content = "\n\n".join([p.get_text() for p in paragraphs])
    return text_content.strip()


def on_scrape(url):
    """
    Gradio handler function: performs dynamic scrape and returns results.
    """
    # Require a real scheme prefix; a bare startswith("http") would also
    # accept strings like "httpfoo" that the error message rules out.
    if not url.startswith(("http://", "https://")):
        return "Please enter a valid URL starting with http or https."
    return scrape_and_parse(url)


with gr.Blocks(title="Playwright Scraper") as demo:
    gr.Markdown("## JavaScript-Aware Web Scraper\n"
                "Enter a URL to scrape dynamic, JavaScript-rendered content using Playwright.")
    url_input = gr.Textbox(label="URL", value="https://example.com")
    output_box = gr.Textbox(label="Scraped Content", lines=10)
    scrape_button = gr.Button("Scrape")
    scrape_button.click(fn=on_scrape, inputs=url_input, outputs=output_box)

# Guard the launch so importing this module doesn't start a server as a side effect.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)