File size: 3,708 Bytes
8670562
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import streamlit as st
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
from requests.exceptions import RequestException

# Function to analyze available tags on the page
def analyze_page(url):
    """Fetch *url* and report which supported HTML tags occur on the page.

    Args:
        url: Address of the page to inspect.

    Returns:
        An alphabetically sorted list of the supported tag names (``h1``-``h6``,
        ``p``, ``a``, ``img``, ``ul``, ``ol``, ``li``) that appear at least once
        in the parsed document. The list is empty when the page contains none
        of them. Returns ``None`` when the request fails; in that case the
        failure is also shown to the user through ``st.error``.
    """
    useful_tags = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'a', 'img', 'ul', 'ol', 'li'}
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an error for bad status codes
        soup = BeautifulSoup(response.content, 'html.parser')
        available_tags = {tag.name for tag in soup.find_all(True) if tag.name in useful_tags}
        # Sets have no defined order; sort so the options presented to the user
        # come out in the same, predictable order on every run.
        return sorted(available_tags)
    except RequestException as e:
        st.error(f"Failed to fetch the page: {str(e)}")
        return None

# Function to scrape selected tags
def scrape_data(url, selected_tags):
    """Download *url* and tabulate every occurrence of the requested tags.

    Args:
        url: Address of the page to scrape.
        selected_tags: Iterable of tag names to look for, processed in order.

    Returns:
        A ``pandas.DataFrame`` with one row per matched element. Every row has
        a ``Type`` column holding the tag name; ``img`` rows add ``Src`` and
        ``Alt Text``, ``a`` rows add ``URL`` and ``Text``, and all other tags
        add ``Content`` (the element's stripped text). Returns ``None`` when
        the request fails, after reporting the error through ``st.error``.
    """

    def build_row(tag_name, element):
        # Images and links carry their payload in attributes; everything else
        # is represented by its visible text.
        if tag_name == 'img':
            return {'Type': tag_name, 'Src': element.get('src', ''), 'Alt Text': element.get('alt', '')}
        if tag_name == 'a':
            return {'Type': tag_name, 'URL': element.get('href', ''), 'Text': element.get_text(strip=True)}
        return {'Type': tag_name, 'Content': element.get_text(strip=True)}

    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
        document = BeautifulSoup(page.content, 'html.parser')
        rows = [
            build_row(tag_name, element)
            for tag_name in selected_tags
            for element in document.find_all(tag_name)
        ]
        return pd.DataFrame(rows)
    except RequestException as err:
        st.error(f"Error scraping data: {str(err)}")
        return None

# Main Streamlit app
def main():
    """Render the scraper UI: URL entry, tag selection, results and downloads.

    Flow:
        1. Ask for a URL and call ``analyze_page`` to find the supported tags.
        2. Let the user pick tags and press "Scrape Now".
        3. Call ``scrape_data`` and, when rows come back, show the table and
           offer it as CSV and JSON downloads.

    Each stage stops early with a message when it cannot proceed: no URL,
    analysis failed, no supported tags on the page, nothing selected, or no
    rows scraped.
    """
    st.title("Live Web Scraper")
    st.write("Enter a URL and select the HTML tags you want to scrape!")

    # URL input; surrounding whitespace from copy/paste is not part of the URL.
    url = st.text_input("Enter the URL to scrape:", key="url_input").strip()
    if not url:
        return

    with st.spinner("Analyzing the page..."):
        available_tags = analyze_page(url)

    # None means the fetch failed; an empty list means the page loaded but
    # holds none of the supported tags. These are different situations and
    # deserve different messages.
    if available_tags is None:
        st.error("Couldn’t analyze the page. Check the URL and try again.")
        return
    if not available_tags:
        st.warning("The page loaded, but none of the supported tags were found on it.")
        return

    # Tag selection
    selected_tags = st.multiselect("Select tags to scrape:", available_tags, key="tag_select")

    if not st.button("Scrape Now", key="scrape_button"):
        return

    # Without a selection there is nothing to extract, so skip the request.
    if not selected_tags:
        st.warning("Select at least one tag before scraping.")
        return

    with st.spinner("Scraping data..."):
        df = scrape_data(url, selected_tags)

    if df is None or df.empty:
        st.warning("No data found for the selected tags.")
        return

    st.success("Scraping complete!")
    st.write("### Scraped Data:")
    st.dataframe(df)

    # Download options
    # NOTE(review): these buttons live under the "Scrape Now" branch; if
    # clicking one reruns the script, the results will presumably disappear
    # until the user scrapes again. Confirm, and consider keeping the frame in
    # st.session_state.
    col1, col2 = st.columns(2)
    with col1:
        csv = df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download as CSV",
            data=csv,
            file_name="scraped_data.csv",
            mime="text/csv",
            key="csv_download"
        )
    with col2:
        json_data = df.to_json(orient='records')
        st.download_button(
            label="Download as JSON",
            data=json_data,
            file_name="scraped_data.json",
            mime="application/json",
            key="json_download"
        )

# Start the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()