SmokeyBandit committed on
Commit
613861f
·
verified ·
1 Parent(s): 852aecc

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -0
app.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ----------------------
2
+ # app.py
3
+ # ----------------------
4
+ import time
5
+ import gradio as gr
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+ from playwright.sync_api import sync_playwright
9
+
10
def dynamic_scrape(url, wait_ms=3000):
    """
    Render `url` in headless Chromium via Playwright and return the page HTML.

    Args:
        url: Address to navigate to.
        wait_ms: Milliseconds to pause after navigation so client-side
            JavaScript has time to run before the HTML is captured
            (defaults to 3000, i.e. 3 seconds).

    Returns:
        The rendered HTML on success. On any failure, a string of the form
        "Error: <details>" is returned instead of raising.
    """
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()

                # Go to the URL
                page.goto(url)

                # Fixed pause (rather than waiting for a specific element)
                # to give page scripts time to finish rendering.
                page.wait_for_timeout(wait_ms)
                return page.content()
            finally:
                # Close the browser even when navigation or rendering fails,
                # so a failed scrape does not skip the explicit shutdown.
                browser.close()
    except Exception as e:
        # Failures are reported as text because the result is shown
        # directly to the user by the caller.
        return f"Error: {e}"
31
+
32
def scrape_and_parse(url):
    """
    Scrape dynamic content, then extract the text of every <p> element.

    Args:
        url: Address passed through to `dynamic_scrape`.

    Returns:
        The paragraph texts joined by blank lines and stripped; the
        "Error: ..." text unchanged when the scrape failed; or a fixed
        notice when the page contains no <p> elements.
    """
    html = dynamic_scrape(url)

    # dynamic_scrape reports failures as an "Error: ..." string. Parsing that
    # as HTML would find no <p> tags and hide the real cause behind the
    # "No <p> tags found" notice, so surface it directly instead.
    if html.startswith("Error:"):
        return html

    soup = BeautifulSoup(html, "html.parser")

    # Grab all <p> elements as an example
    paragraphs = soup.find_all("p")
    if not paragraphs:
        return "No <p> tags found, or site is heavily JavaScript-based."

    text_content = "\n\n".join(p.get_text() for p in paragraphs)
    return text_content.strip()
46
+
47
def on_scrape(url):
    """
    Gradio handler function: validates the URL, then scrapes it.

    Args:
        url: Text entered by the user; may be empty or None.

    Returns:
        The result of `scrape_and_parse` for a URL that begins with
        "http://" or "https://" (after surrounding whitespace is removed),
        otherwise a message asking for a valid URL.
    """
    # Treat a missing value like an empty string and ignore stray whitespace
    # that commonly comes along with pasted URLs.
    candidate = (url or "").strip()

    # Require the full scheme prefix; a bare "http" check would also accept
    # inputs such as "http" or "httpfoo" that are not URLs.
    if not candidate.startswith(("http://", "https://")):
        return "Please enter a valid URL starting with http or https."
    return scrape_and_parse(candidate)
54
+
55
# Assemble the single-page UI: a heading, a URL field, a results area, and a
# button that runs `on_scrape` on the URL and shows what it returns.
with gr.Blocks(title="Playwright Scraper") as demo:
    gr.Markdown(
        "## JavaScript-Aware Web Scraper\n"
        "Enter a URL to scrape dynamic, JavaScript-rendered content using Playwright."
    )

    # Components are created in display order: input, output, then the trigger.
    url_input = gr.Textbox(
        label="URL",
        value="https://example.com",
    )
    output_box = gr.Textbox(
        label="Scraped Content",
        lines=10,
    )
    scrape_button = gr.Button("Scrape")

    # Clicking the button sends the URL text to the handler and writes the
    # returned string into the output box.
    scrape_button.click(
        fn=on_scrape,
        inputs=url_input,
        outputs=output_box,
    )

# Start the server on all network interfaces at port 7860.
demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
)