import streamlit as st
import urllib3
from bs4 import BeautifulSoup
import pandas as pd
import tempfile


def simple_web_scraper(url, scrape_option):
    try:
        # Create a urllib3 PoolManager to handle the HTTP(S) connection
        http = urllib3.PoolManager()
        # Send an HTTP GET request
        response = http.request('GET', url)
        # Check if the request was successful (status code 200)
        if response.status == 200:
            # Parse the HTML content of the page
            soup = BeautifulSoup(response.data, 'html.parser')
            # Extract information from the HTML based on the user's choice
            if scrape_option == 'data':
                # Extract all text content from the page
                all_text = soup.get_text()
                # Prepare data for the table (split the text into non-empty lines)
                table_data = [{'Data': line.strip()} for line in all_text.split('\n') if line.strip()]
                # Display the data in a table
                st.table(table_data)
                # Save the data to a temporary CSV file
                df = pd.DataFrame(table_data)
                csv_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
                df.to_csv(csv_file.name, index=False)
                # Provide a download button for the CSV file
                with open(csv_file.name, "rb") as f:
                    st.download_button(
                        label="Download Data as CSV",
                        data=f.read(),
                        file_name="scraped_data.csv",
                        mime="text/csv",
                    )
            elif scrape_option == 'links':
                # Extract all the links on the page
                links = soup.find_all('a')
                # Prepare data for the table
                table_data = [{'Links': link.get('href')} for link in links if link.get('href')]
                # Display the links in a table
                st.table(table_data)
                # Save the links to a temporary CSV file
                df = pd.DataFrame(table_data)
                csv_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
                df.to_csv(csv_file.name, index=False)
                # Provide a download button for the CSV file
                with open(csv_file.name, "rb") as f:
                    st.download_button(
                        label="Download Links as CSV",
                        data=f.read(),
                        file_name="scraped_links.csv",
                        mime="text/csv",
                    )
            else:
                st.write('Invalid scrape option. Please choose "data" or "links".')
        else:
            st.write(f'Error: HTTP status {response.status}')
    except Exception as e:
        st.write(f'An error occurred: {e}')


# Streamlit UI
st.title("Web Scraping Tool")
website_url = st.text_input("Enter the URL to scrape:")
scrape_option = st.selectbox("Select what to scrape:", ['data', 'links'])
if st.button("Scrape"):
    simple_web_scraper(website_url, scrape_option)