Spaces:
Sleeping
Sleeping
File size: 3,069 Bytes
4baad7f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import streamlit as st
import requests
from bs4 import BeautifulSoup
import urllib3
import pandas as pd
import tempfile
def simple_web_scraper(url, scrape_option):
    """Fetch *url* and render the result in the Streamlit app.

    Depending on *scrape_option*:
      - 'data'  : show every non-blank line of the page's visible text.
      - 'links' : show the href of every anchor tag on the page.
    Either result is displayed as a table with a CSV download button.
    Any other option, a non-200 response, or an exception is reported
    via ``st.write``.

    Parameters:
        url: Address to fetch with a plain GET request.
        scrape_option: Either 'data' or 'links' (see above).
    """
    try:
        # urllib3 directly (rather than the imported-but-unused `requests`);
        # kept to preserve the original transport behavior.
        http = urllib3.PoolManager()
        response = http.request('GET', url)
        if response.status == 200:
            soup = BeautifulSoup(response.data, 'html.parser')
            if scrape_option == 'data':
                # One table row per non-blank line of visible text.
                all_text = soup.get_text()
                table_data = [{'Data': line.strip()} for line in all_text.split('\n') if line.strip()]
                _display_with_download(table_data, "Download Data as CSV", "scraped_data.csv")
            elif scrape_option == 'links':
                # One table row per anchor that actually carries an href.
                links = soup.find_all('a')
                table_data = [{'Links': link.get('href')} for link in links if link.get('href')]
                _display_with_download(table_data, "Download Links as CSV", "scraped_links.csv")
            else:
                st.write('Invalid scrape option. Please choose "data" or "links".')
        else:
            st.write(f'Error: {response.status}')
    except Exception as e:
        # Broad catch is deliberate here: this is the UI boundary and any
        # failure (DNS, SSL, parse) should surface as a message, not a crash.
        st.write(f'An error occurred: {e}')


def _display_with_download(table_data, label, file_name):
    """Render *table_data* as a table plus a CSV download button.

    Builds the CSV entirely in memory via ``DataFrame.to_csv(index=False)``,
    fixing the original's resource leak: it wrote a
    ``NamedTemporaryFile(delete=False)`` that was never removed and opened a
    file handle that was never closed.
    """
    st.table(table_data)
    df = pd.DataFrame(table_data)
    csv_bytes = df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label=label,
        data=csv_bytes,
        file_name=file_name,
        mime="text/csv",
    )
# Streamlit UI: collect a URL and a scrape mode, then run the scraper on click.
st.title("Web Scraping Tool")
website_url = st.text_input("Enter the URL to scrape:")
scrape_option = st.selectbox("Select what to scrape:", ['data', 'links'])
if st.button("Scrape"):
    # Guard against a blank URL — without this, clicking "Scrape" immediately
    # fires a GET on '' and shows a raw exception message to the user.
    if website_url.strip():
        simple_web_scraper(website_url, scrape_option)
    else:
        st.write("Please enter a URL before scraping.")
|