Spaces:

SmokeyBandit
/

testdockerspace1

Sleeping

App Files Files Community

SmokeyBandit committed on Feb 17, 2025

Commit

613861f

verified ·

1 Parent(s): 852aecc

Create app.py

Browse files

Files changed (1) hide show

app.py +65 -0

app.py ADDED Viewed

	@@ -0,0 +1,65 @@

+# ----------------------
+#  app.py
+# ----------------------
+import time
+import gradio as gr
+import requests
+from bs4 import BeautifulSoup
+from playwright.sync_api import sync_playwright
+def dynamic_scrape(url):
+    """
+    Launch a headless browser via Playwright, navigate to `url`,
+    wait for JavaScript to load, and return the rendered HTML.
+    """
+    try:
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=True)
+            page = browser.new_page()
+            # Go to the URL
+            page.goto(url)
+            # Wait a few seconds (or for a specific element) to ensure JS is loaded
+            page.wait_for_timeout(3000)  # 3 seconds
+            rendered_html = page.content()
+            browser.close()
+        return rendered_html
+    except Exception as e:
+        return f"Error: {e}"
+def scrape_and_parse(url):
+    """
+    Scrape dynamic content, then parse with BeautifulSoup for demonstration.
+    """
+    html = dynamic_scrape(url)
+    soup = BeautifulSoup(html, "html.parser")
+    # Grab all <p> elements as an example
+    paragraphs = soup.find_all("p")
+    if not paragraphs:
+        return "No <p> tags found, or site is heavily JavaScript-based."
+    text_content = "\n\n".join([p.get_text() for p in paragraphs])
+    return text_content.strip()
+def on_scrape(url):
+    """
+    Gradio handler function: performs dynamic scrape and returns results.
+    """
+    if not url.startswith("http"):
+        return "Please enter a valid URL starting with http or https."
+    return scrape_and_parse(url)
+with gr.Blocks(title="Playwright Scraper") as demo:
+    gr.Markdown("## JavaScript-Aware Web Scraper\n"
+                "Enter a URL to scrape dynamic, JavaScript-rendered content using Playwright.")
+    url_input = gr.Textbox(label="URL", value="https://example.com")
+    output_box = gr.Textbox(label="Scraped Content", lines=10)
+    scrape_button = gr.Button("Scrape")
+    scrape_button.click(fn=on_scrape, inputs=url_input, outputs=output_box)
+demo.launch(server_name="0.0.0.0", server_port=7860)