bot-bqf6pz2f / src /streamlit_app.py
bep40's picture
Upload src/streamlit_app.py with huggingface_hub
600deca verified
Files: 2 files loaded
=== app.py ===
import streamlit as st
import requests
from bs4 import BeautifulSoup
import re
# Function to scrape only visible text from the given URL
def scrape_visible_text_from_url(url):
    """Fetch *url* and return its visible text (header text + paragraph text).

    Parameters
    ----------
    url : str
        Address of the web page to scrape.

    Returns
    -------
    str | None
        The whitespace-normalized visible text, or ``None`` on any failure
        (an error message is shown in the Streamlit UI instead of raising).
    """
    try:
        # timeout prevents the app from hanging forever on an unresponsive host
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Capture the header text BEFORE the removal pass below: "header" is
        # in the strip list, so looking it up afterwards would always yield
        # None and silently drop the header from the output (original bug).
        header_content = soup.find("header")
        header_text = header_content.get_text() if header_content else ""
        # Remove script, style, and other non-visible/boilerplate tags so
        # their contents do not leak into the paragraph text.
        for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav", "img"]):
            tag.extract()
        # Get the paragraph content
        paragraph_text = " ".join(p.get_text() for p in soup.find_all("p"))
        # Combine header and paragraph text
        visible_text = f"{header_text}\n\n{paragraph_text}"
        # Collapse runs of whitespace/newlines into single spaces
        visible_text = re.sub(r'\s+', ' ', visible_text)
        return visible_text.strip()
    except Exception as e:
        # Broad catch is deliberate here: any network/parse failure is
        # surfaced to the UI rather than crashing the Streamlit app.
        st.error(f"Error occurred while scraping the data: {e}")
        return None
# Streamlit UI
def main():
    """Streamlit entry point: prompt for a URL and display its scraped text."""
    st.title("Web Data Scraper")
    # Ask the user for the page to scrape
    target_url = st.text_input("Enter the URL of the web page:", "")
    # Nothing to do until the button is pressed
    if not st.button("Scrape Visible Text"):
        return
    # Guard clause: an empty input field cannot be scraped
    if not target_url:
        st.warning("Please enter a valid URL.")
        return
    scraped_text = scrape_visible_text_from_url(target_url)
    if scraped_text:
        st.success("Visible text successfully scraped!")
        st.subheader("Scraped Text:")
        st.write(scraped_text)
    else:
        st.warning("Failed to scrape visible text from the URL.")


if __name__ == "__main__":
    main()
=== requirements.txt ===
aiohttp==3.8.5
aiosignal==1.3.1
altair==5.0.1
async-timeout==4.0.2
attrs==23.1.0
beautifulsoup4==4.12.2
blinker==1.6.2
bs4==0.0.1
cachetools==5.3.1
certifi==2023.7.22
charset-normalizer==3.2.0
click==8.1.6
decorator==5.1.1
frozenlist==1.4.0
gitdb==4.0.10
GitPython==3.1.32
idna==3.4
importlib-metadata==6.8.0
Jinja2==3.1.2
jsonschema==4.18.4
jsonschema-specifications==2023.7.1
markdown-it-py==3.0.0
MarkupSafe==2.1.3
mdurl==0.1.2
multidict==6.0.4
numpy==1.25.2
openai==0.27.8
packaging==23.1
pandas==2.0.3
Pillow==9.5.0
protobuf==4.23.4
pyarrow==12.0.1
pydeck==0.8.0
Pygments==2.15.1
Pympler==1.0.1
python-dateutil==2.8.2
python-dotenv==1.0.0
pytz==2023.3
pytz-deprecation-shim==0.1.0.post0
referencing==0.30.0
requests==2.31.0
rich==13.5.2
rpds-py==0.9.2
six==1.16.0
smmap==5.0.0
soupsieve==2.4.1
streamlit==1.25.0
tenacity==8.2.2
toml==0.10.2
toolz==0.12.0
tornado==6.3.2
tqdm==4.65.0
typing_extensions==4.7.1
tzdata==2023.3
tzlocal==4.3.1
urllib3==2.0.4
validators==0.20.0
watchdog==3.0.0
yarl==1.9.2
zipp==3.16.2