# Provenance: Hugging Face Space file "app.py" by SmokeyBandit (commit 613861f, verified).
# ----------------------
# app.py
# ----------------------
import time
import gradio as gr
import requests
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
def dynamic_scrape(url):
    """
    Render `url` in a headless Chromium browser via Playwright and return
    the page HTML after client-side JavaScript has had time to run.

    Parameters
    ----------
    url : str
        The page to load (expected to be an http/https URL).

    Returns
    -------
    str
        The rendered HTML, or a string beginning with "Error: " when
        launching, navigating, or rendering fails (callers display it).
    """
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(url)
                # Fixed grace period for JS rendering; waiting on a specific
                # selector would be more precise but needs per-site knowledge.
                page.wait_for_timeout(3000)  # 3 seconds
                return page.content()
            finally:
                # Always release the browser, even if goto/wait raises;
                # otherwise the process leaks until playwright teardown.
                browser.close()
    except Exception as e:
        # Keep the error-as-string contract used by scrape_and_parse.
        return f"Error: {e}"
def scrape_and_parse(url):
    """
    Scrape JavaScript-rendered content from `url` and extract paragraph text.

    Parameters
    ----------
    url : str
        The page to scrape.

    Returns
    -------
    str
        The text of all <p> elements joined by blank lines, an explanatory
        message when none are found, or the "Error: ..." string produced by
        dynamic_scrape on failure.
    """
    html = dynamic_scrape(url)
    # dynamic_scrape signals failure via an "Error: ..." string; don't feed
    # that to the parser — it would just yield the misleading
    # "No <p> tags found" message instead of the real error.
    if html.startswith("Error:"):
        return html
    soup = BeautifulSoup(html, "html.parser")
    # Grab all <p> elements as an example
    paragraphs = soup.find_all("p")
    if not paragraphs:
        return "No <p> tags found, or site is heavily JavaScript-based."
    text_content = "\n\n".join(p.get_text() for p in paragraphs)
    return text_content.strip()
def on_scrape(url):
    """
    Gradio click handler: validate the URL, then scrape and parse it.

    Parameters
    ----------
    url : str
        User-supplied URL from the textbox (may be None or padded).

    Returns
    -------
    str
        Scraped paragraph text, or a validation/error message.
    """
    url = (url or "").strip()
    # Require a real scheme: a bare startswith("http") would accept
    # strings like "httpfoo" or "https" with no "://" separator.
    if not url.startswith(("http://", "https://")):
        return "Please enter a valid URL starting with http or https."
    return scrape_and_parse(url)
# Build the Gradio UI: one URL input, one output box, and a button wired
# to the on_scrape handler.
with gr.Blocks(title="Playwright Scraper") as demo:
    gr.Markdown("## JavaScript-Aware Web Scraper\n"
                "Enter a URL to scrape dynamic, JavaScript-rendered content using Playwright.")
    url_input = gr.Textbox(label="URL", value="https://example.com")
    output_box = gr.Textbox(label="Scraped Content", lines=10)
    scrape_button = gr.Button("Scrape")
    scrape_button.click(fn=on_scrape, inputs=url_input, outputs=output_box)

# Launch only when executed as a script (how Spaces run app.py), so the
# module can be imported — e.g. for tests — without starting a server.
if __name__ == "__main__":
    # 0.0.0.0/7860 is the standard bind for containerized Gradio apps.
    demo.launch(server_name="0.0.0.0", server_port=7860)