Spaces:

guifav
/

text_tiger

Build error

text_tiger / app.py

Updated app to use Gradio instead of Streamlit

302b997 over 1 year ago

1.77 kB

	import gradio as gr
	import requests
	from bs4 import BeautifulSoup
	import re

	def scrape_visible_text_from_url(url, timeout=10):
	    """Download a web page and return its header and paragraph text.

	    Args:
	        url: Address of the page to fetch.
	        timeout: Seconds to wait for the server before giving up, so a
	            stalled connection cannot block the caller indefinitely.

	    Returns:
	        The text of the first ``<header>`` element followed by the text of
	        every remaining ``<p>`` element, with each run of whitespace
	        collapsed to a single space and the ends trimmed. If anything
	        fails, a message starting with
	        ``"Error occurred while scraping the data:"`` is returned instead
	        of raising.
	    """
	    try:
	        response = requests.get(url, timeout=timeout)
	        response.raise_for_status()
	        soup = BeautifulSoup(response.content, 'html.parser')

	        # Remove script, style, and other non-content tags. <header> is
	        # deliberately kept for now: its text is read just below.
	        for tag in soup(["script", "style", "meta", "link", "noscript", "footer", "aside", "nav", "img"]):
	            tag.extract()

	        # The header text must be read before <header> is stripped,
	        # otherwise find() has nothing left to return.
	        header_content = soup.find("header")
	        header_text = header_content.get_text(" ") if header_content else ""

	        # Now drop every <header> so paragraphs nested inside one are not
	        # repeated in the paragraph text.
	        for tag in soup("header"):
	            tag.extract()

	        # Get the paragraph content
	        paragraph_text = " ".join(p.get_text() for p in soup.find_all("p"))

	        # Combine header and paragraph text
	        visible_text = f"{header_text}\n\n{paragraph_text}"

	        # Remove multiple whitespaces and newlines
	        visible_text = re.sub(r'\s+', ' ', visible_text)
	        return visible_text.strip()
	    except Exception as e:
	        # Any failure (network, HTTP status, parsing) is reported to the
	        # caller as text rather than raised.
	        return f"Error occurred while scraping the data: {e}"

	def scrape_and_display(url):
	    """Validate the submitted URL and return the text to show in the UI.

	    Args:
	        url: Value entered by the user; ``None``, empty, or whitespace-only
	            input is treated as missing.

	    Returns:
	        The scraped text, or a short message asking for a URL or reporting
	        that nothing could be scraped.
	    """
	    # Strip surrounding whitespace so a blank-looking entry is rejected
	    # here instead of being sent to the scraper.
	    cleaned_url = str(url).strip() if url else ""
	    if not cleaned_url:
	        return "Please enter a valid URL."

	    data = scrape_visible_text_from_url(cleaned_url)
	    if not data:
	        return "Failed to scrape visible text from the URL."
	    return data

	# Build the Gradio UI: one text box takes the URL, another shows the result.
	_url_input = gr.Textbox(label="Enter the URL of the web page:")
	_text_output = gr.Textbox(label="Scraped Text:")

	# NOTE(review): the theme is passed as the name string "huggingface" --
	# confirm the installed Gradio version still recognises it.
	iface = gr.Interface(
	    fn=scrape_and_display,
	    inputs=_url_input,
	    outputs=_text_output,
	    title="Web Data Scraper",
	    description="Enter a URL to scrape visible text from the web page.",
	    theme="huggingface",
	)

	# Start the app only when this file is run as a script, not when imported.
	if __name__ == "__main__":
	    iface.launch()