import streamlit as st
import urllib3
from bs4 import BeautifulSoup
import pandas as pd
import tempfile


def simple_web_scraper(url, scrape_option):
    try:
        # Create a urllib3 PoolManager to handle the HTTP(S) connection
        http = urllib3.PoolManager()
        # Send an HTTP GET request
        response = http.request('GET', url)
        # Check if the request was successful (status code 200)
        if response.status == 200:
            # Parse the HTML content of the page
            soup = BeautifulSoup(response.data, 'html.parser')
            # Extract information from the HTML based on the user's choice
            if scrape_option == 'data':
                # Extract all text content from the page
                all_text = soup.get_text()
                # Prepare data for the table (split the text into non-empty lines)
                table_data = [{'Data': line.strip()} for line in all_text.split('\n') if line.strip()]
                # Display the data in a table
                st.table(table_data)
                # Save the data to a temporary CSV file
                df = pd.DataFrame(table_data)
                csv_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
                df.to_csv(csv_file.name, index=False)
                # Provide a download button for the CSV file
                with open(csv_file.name, "rb") as f:
                    st.download_button(
                        label="Download Data as CSV",
                        data=f.read(),
                        file_name="scraped_data.csv",
                        mime="text/csv",
                    )
            elif scrape_option == 'links':
                # Extract all the links on the page
                links = soup.find_all('a')
                # Prepare data for the table
                table_data = [{'Links': link.get('href')} for link in links if link.get('href')]
                # Display the links in a table
                st.table(table_data)
                # Save the links to a temporary CSV file
                df = pd.DataFrame(table_data)
                csv_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
                df.to_csv(csv_file.name, index=False)
                # Provide a download button for the CSV file
                with open(csv_file.name, "rb") as f:
                    st.download_button(
                        label="Download Links as CSV",
                        data=f.read(),
                        file_name="scraped_links.csv",
                        mime="text/csv",
                    )
            else:
                st.write('Invalid scrape option. Please choose "data" or "links".')
        else:
            st.write(f'Error: HTTP status {response.status}')
    except Exception as e:
        st.write(f'An error occurred: {e}')


# Streamlit UI
st.title("Web Scraping Tool")
website_url = st.text_input("Enter the URL to scrape:")
scrape_option = st.selectbox("Select what to scrape:", ['data', 'links'])
if st.button("Scrape"):
    simple_web_scraper(website_url, scrape_option)