File size: 3,738 Bytes
ec67f77
46325f0
ec67f77
 
788a917
ec67f77
 
46325f0
2f54431
ec67f77
 
 
 
46325f0
ec67f77
 
 
 
2f54431
 
 
46325f0
 
 
ec67f77
 
 
 
 
2f54431
ec67f77
 
2f54431
 
ec67f77
46325f0
ec67f77
2f54431
ec67f77
46325f0
2f54431
46325f0
ec67f77
 
 
46325f0
ec67f77
46325f0
ec67f77
2f54431
ec67f77
 
 
2f54431
ec67f77
 
46325f0
 
 
 
 
ec67f77
46325f0
 
 
 
 
ec67f77
 
46325f0
ec67f77
 
 
 
87104d9
ec67f77
2f54431
46325f0
2f54431
ec67f77
 
 
 
 
 
 
 
 
46325f0
ec67f77
 
46325f0
ec67f77
2f54431
 
 
46325f0
ec67f77
 
 
 
 
 
46325f0
ec67f77
46325f0
 
ec67f77
46325f0
 
 
 
 
ec67f77
46325f0
 
 
2f54431
46325f0
ec67f77
 
46325f0
ec67f77
46325f0
 
ec67f77
46325f0
 
ec67f77
46325f0
 
 
 
 
ec67f77
46325f0
2f54431
46325f0
 
 
788a917
46325f0
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# =========================================================
# ๐ŸŒ WEBSITE CRAWLER + DOWNLOAD TOOL
# =========================================================

import streamlit as st
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

# ==============================
# PAGE CONFIG
# ==============================
st.set_page_config(layout="wide", page_title="๐ŸŒ Website Crawler")

# ==============================
# SESSION STATE
# ==============================
# Seed the per-session containers on the first run only; later reruns of the
# script keep whatever values are already stored.
for state_key in ("links", "data"):
    if state_key not in st.session_state:
        st.session_state[state_key] = []

# ==============================
# CRAWL WEBSITE
# ==============================
def crawl_website(url, max_links=30):
    """Fetch *url* and return the absolute http(s) links found on the page.

    Args:
        url: Address of the page whose ``<a href=...>`` anchors are collected.
        max_links: Upper bound on the number of links returned.

    Returns:
        A list of unique absolute URLs in the order they first appear in the
        document, truncated to ``max_links``. An empty list is returned when
        the page cannot be fetched (network failure, HTTP error status, or a
        malformed URL).
    """
    try:
        res = requests.get(url, timeout=10)
        # Treat 4xx/5xx responses as failures instead of scraping error pages.
        res.raise_for_status()
    except (requests.RequestException, ValueError):
        return []

    soup = BeautifulSoup(res.text, "html.parser")

    # A dict de-duplicates while keeping first-seen order, so truncating to
    # max_links yields the same subset on every run (a set would not).
    links = {}
    for a in soup.find_all("a", href=True):
        try:
            link = urljoin(url, a["href"])
        except ValueError:
            # A single malformed href must not discard the rest of the page.
            continue
        if link.startswith(("http://", "https://")):
            links[link] = None

    return list(links)[:max_links]

# ==============================
# EXTRACT PAGE CONTENT
# ==============================
def extract_page(url):
    """Download *url* and extract its paragraph text and image URLs.

    Args:
        url: Address of the page to scrape.

    Returns:
        A dict with the keys ``"url"`` (the requested address), ``"text"``
        (the non-empty ``<p>`` contents joined by single spaces) and
        ``"images"`` (absolute URLs of every ``<img>`` that has a ``src``),
        or ``None`` when the page cannot be fetched (network failure, HTTP
        error status, or a malformed URL).
    """
    try:
        res = requests.get(url, timeout=10)
        # Treat 4xx/5xx responses as failures instead of scraping error pages.
        res.raise_for_status()
    except (requests.RequestException, ValueError):
        return None

    soup = BeautifulSoup(res.text, "html.parser")

    # TEXT: skip empty paragraphs so the joined text has no runs of spaces.
    paragraphs = (p.get_text().strip() for p in soup.find_all("p"))
    text = " ".join(chunk for chunk in paragraphs if chunk)

    # IMAGES: an <img> without a usable src would make urljoin() return the
    # page URL itself, which is not an image, so such tags are skipped.
    images = []
    for img in soup.find_all("img"):
        src = (img.get("src") or "").strip()
        if not src:
            continue
        try:
            images.append(urljoin(url, src))
        except ValueError:
            # A single malformed src must not discard the rest of the page.
            continue

    return {
        "url": url,
        "text": text,
        "images": images,
    }

# ==============================
# UI
# ==============================
st.title("๐ŸŒEfficient Website Crawler")

# ==============================
# STEP 1: ENTER URL
# ==============================
# Strip surrounding whitespace so a URL pasted with padding is still usable.
url = st.text_input("๐Ÿ”— Enter Website URL").strip()

if st.button("Crawl Website"):
    if not url:
        # Nothing to fetch: ask for input rather than reporting a failed crawl.
        st.warning("Please enter a website URL")
    else:
        links = crawl_website(url)

        if links:
            # Persist the result so it survives the rerun triggered by the
            # next widget interaction.
            st.session_state.links = links
            st.success(f"Found {len(links)} pages")
        else:
            st.error("No links found")

# ==============================
# STEP 2: SELECT PAGES
# ==============================
# Links ticked by the user on this run; stays empty until a crawl has
# produced something to choose from.
selected_links = []

if st.session_state.links:
    st.subheader("๐Ÿ“„ Select Pages to Crawl")

    # One checkbox per crawled link; keep the links whose box is ticked.
    selected_links = [
        page_url for page_url in st.session_state.links if st.checkbox(page_url)
    ]

# ==============================
# STEP 3: EXTRACT DATA
# ==============================
if st.button("Extract Selected Pages"):
    with st.spinner("Extracting content..."):
        # extract_page yields None for pages it could not process; keep only
        # the successful results, in selection order.
        all_data = [page for page in map(extract_page, selected_links) if page]

    if not all_data:
        st.warning("No data extracted")
    else:
        st.session_state.data = all_data
        st.success("โœ… Data extracted successfully!")

# ==============================
# STEP 4: SHOW DATA
# ==============================
if st.session_state.data:
    st.subheader("๐Ÿ“Š Extracted Data Preview")

    # Render the stored page records as a table, one row per page.
    st.dataframe(pd.DataFrame(st.session_state.data))

# ==============================
# STEP 5: DOWNLOAD OPTIONS
# ==============================
if st.session_state.data:
    st.subheader("โฌ‡๏ธ Download Data")

    export_frame = pd.DataFrame(st.session_state.data)

    # CSV: serialized without the index column and encoded as UTF-8 bytes.
    st.download_button(
        label="Download CSV",
        data=export_frame.to_csv(index=False).encode("utf-8"),
        file_name="website_data.csv",
        mime="text/csv",
    )

    # JSON: one object per page record, pretty-printed.
    st.download_button(
        label="Download JSON",
        data=export_frame.to_json(orient="records", indent=2),
        file_name="website_data.json",
        mime="application/json",
    )