Spaces:
Sleeping
Sleeping
| # ---------------------- | |
| # app.py | |
| # ---------------------- | |
| import time | |
| import gradio as gr | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from playwright.sync_api import sync_playwright | |
def dynamic_scrape(url, wait_ms=3000):
    """
    Render `url` in headless Chromium via Playwright and return the HTML.

    Args:
        url: Address to navigate to.
        wait_ms: Milliseconds to pause after navigation so client-side
            JavaScript can populate the DOM. Defaults to 3000 (3 seconds).

    Returns:
        The rendered page HTML on success, or a string of the form
        "Error: <message>" if anything fails. Exceptions are not propagated.
    """
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(url)
                # Fixed pause rather than waiting on a specific element, since
                # no selector is known for an arbitrary page.
                page.wait_for_timeout(wait_ms)
                return page.content()
            finally:
                # Close the browser even when navigation or rendering raised.
                browser.close()
    except Exception as e:
        # Callers display the result as text, so failures come back as a string.
        return f"Error: {e}"
def scrape_and_parse(url):
    """
    Scrape the rendered page at `url` and return the text of its <p> elements.

    Returns:
        The paragraph texts joined by blank lines and stripped of surrounding
        whitespace; the "Error: ..." string from `dynamic_scrape` if the
        scrape failed; or a fixed notice when the page has no <p> tags.
    """
    html = dynamic_scrape(url)
    # dynamic_scrape reports failures as an "Error: ..." string. Return it
    # as-is; parsing it as HTML would hide the real cause behind the
    # "No <p> tags found" notice below.
    if html.startswith("Error: "):
        return html

    soup = BeautifulSoup(html, "html.parser")
    # Grab all <p> elements as an example
    paragraphs = soup.find_all("p")
    if not paragraphs:
        return "No <p> tags found, or site is heavily JavaScript-based."
    return "\n\n".join(p.get_text() for p in paragraphs).strip()
def on_scrape(url):
    """
    Gradio handler function: validates the URL, then scrapes and parses it.

    Args:
        url: Text entered by the user. Surrounding whitespace is ignored and
            a missing value (None) is treated as empty.

    Returns:
        The result of `scrape_and_parse` for an http:// or https:// URL,
        otherwise a message asking for a valid URL.
    """
    url = (url or "").strip()
    # Require a full scheme prefix so inputs like "httpfoo" are rejected;
    # compare case-insensitively so an uppercase scheme is still accepted.
    if not url.lower().startswith(("http://", "https://")):
        return "Please enter a valid URL starting with http or https."
    return scrape_and_parse(url)
# Build the UI: a heading, a URL field, a results box and a trigger button.
with gr.Blocks(title="Playwright Scraper") as demo:
    gr.Markdown(
        "## JavaScript-Aware Web Scraper\nEnter a URL to scrape dynamic, JavaScript-rendered content using Playwright."
    )
    url_input = gr.Textbox(value="https://example.com", label="URL")
    output_box = gr.Textbox(lines=10, label="Scraped Content")
    scrape_button = gr.Button("Scrape")
    # Clicking the button sends the URL text to on_scrape and shows its result.
    scrape_button.click(
        fn=on_scrape,
        inputs=url_input,
        outputs=output_box,
    )

# Listen on all interfaces at port 7860.
demo.launch(server_port=7860, server_name="0.0.0.0")