File size: 3,069 Bytes
4baad7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import streamlit as st
import requests
from bs4 import BeautifulSoup
import urllib3
import pandas as pd
import tempfile

def simple_web_scraper(url, scrape_option):
    """Fetch *url* and render either its text content or its links in Streamlit.

    Parameters
    ----------
    url : str
        Address of the page to fetch via HTTP GET.
    scrape_option : str
        'data'  -> one table row per non-blank line of the page's visible text;
        'links' -> one table row per <a> tag that carries an href attribute.
        Any other value displays an error message and does nothing else.

    All output (table, download button, error text) goes to the Streamlit
    app; the function returns nothing.
    """
    try:
        # PoolManager gives connection pooling plus certificate verification.
        http = urllib3.PoolManager()
        response = http.request('GET', url)

        # Guard clause: bail out early on any non-200 response.
        if response.status != 200:
            st.write(f'Error: {response.status}')
            return

        soup = BeautifulSoup(response.data, 'html.parser')

        # Both options share the table/CSV/download flow; only the row
        # extraction, button label, and file name differ.
        if scrape_option == 'data':
            table_data = _text_rows(soup)
            label, file_name = "Download Data as CSV", "scraped_data.csv"
        elif scrape_option == 'links':
            table_data = _link_rows(soup)
            label, file_name = "Download Links as CSV", "scraped_links.csv"
        else:
            st.write('Invalid scrape option. Please choose "data" or "links".')
            return

        st.table(table_data)

        # Build the CSV entirely in memory. The original wrote a
        # NamedTemporaryFile(delete=False) — leaking one temp file on disk
        # per click — and read it back through an open() handle that was
        # never closed. to_csv() with no path returns the CSV as a string.
        csv_bytes = pd.DataFrame(table_data).to_csv(index=False).encode('utf-8')
        st.download_button(
            label=label,
            data=csv_bytes,
            file_name=file_name,
            mime="text/csv",
        )

    except Exception as e:
        # Broad catch is deliberate here: any failure (network, parse, UI)
        # should surface as a message in the app, not crash the script.
        st.write(f'An error occurred: {e}')


def _text_rows(soup):
    """Return one {'Data': line} dict per non-blank line of the page text."""
    all_text = soup.get_text()
    return [{'Data': line.strip()} for line in all_text.split('\n') if line.strip()]


def _link_rows(soup):
    """Return one {'Links': href} dict per anchor tag with an href."""
    return [{'Links': a.get('href')} for a in soup.find_all('a') if a.get('href')]

# --- Streamlit UI ---------------------------------------------------------
# Collect a URL and a scrape mode from the user, then run the scraper on
# button press. Runs at module level, as Streamlit scripts do.
st.title("Web Scraping Tool")

target_url = st.text_input("Enter the URL to scrape:")
chosen_option = st.selectbox("Select what to scrape:", ['data', 'links'])

if st.button("Scrape"):
    simple_web_scraper(target_url, chosen_option)