# =========================================================
# WEBSITE CRAWLER + DOWNLOAD TOOL
# =========================================================
import streamlit as st
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
# ==============================
# PAGE CONFIG
# ==============================
# Set the browser-tab title and let the app use the full page width.
# NOTE(review): the leading glyph in the title looks like a mis-decoded
# emoji -- confirm the intended character.
st.set_page_config(
    page_title="๐ Website Crawler",
    layout="wide",
)
# ==============================
# SESSION STATE
# ==============================
# Seed the per-session containers once so later sections can read them
# unconditionally: "links" holds crawled URLs, "data" holds extracted pages.
for _state_key in ("links", "data"):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = []
# ==============================
# CRAWL WEBSITE
# ==============================
def crawl_website(url, max_links=30):
    """Fetch ``url`` and return the absolute HTTP(S) links found on the page.

    Args:
        url: Address of the page whose ``<a href=...>`` anchors are collected.
        max_links: Upper bound on the number of links returned. Defaults to
            30, the limit that used to be hard-coded; ``None`` disables it.

    Returns:
        A list of unique absolute URLs in the order they first appear in the
        document, truncated to ``max_links``. An empty list is returned when
        the page cannot be fetched (invalid URL, network failure, or an HTTP
        error status).
    """
    try:
        res = requests.get(url, timeout=10)
        # Treat 4xx/5xx answers as failures instead of harvesting links from
        # an error page.
        res.raise_for_status()
    except requests.RequestException:
        # Best-effort crawl: the caller reports an empty result to the user.
        return []

    soup = BeautifulSoup(res.text, "html.parser")

    # A dict de-duplicates while keeping first-seen order, so the truncation
    # below always keeps the same (earliest) links rather than an arbitrary
    # subset, as slicing a set would.
    links = {}
    for a in soup.find_all("a", href=True):
        try:
            link = urljoin(url, a["href"])
        except ValueError:
            # A malformed href must not discard every other link on the page.
            continue
        if link.startswith(("http://", "https://")):
            links.setdefault(link, None)
    return list(links)[:max_links]
# ==============================
# EXTRACT PAGE CONTENT
# ==============================
def extract_page(url):
    """Download ``url`` and extract its paragraph text and image URLs.

    Args:
        url: Address of the page to download.

    Returns:
        A dict with the keys ``"url"`` (the requested address), ``"text"``
        (the non-empty ``<p>`` contents joined by single spaces) and
        ``"images"`` (absolute URLs of every ``<img>`` that has a ``src``).
        ``None`` is returned when the page cannot be fetched (invalid URL,
        network failure, or an HTTP error status).
    """
    try:
        res = requests.get(url, timeout=10)
        # Do not report the body of a 4xx/5xx error page as extracted content.
        res.raise_for_status()
    except requests.RequestException:
        return None

    soup = BeautifulSoup(res.text, "html.parser")

    # TEXT: skip empty paragraphs so the joined text has no runs of spaces.
    paragraphs = (p.get_text().strip() for p in soup.find_all("p"))
    text = " ".join(part for part in paragraphs if part)

    # IMAGES
    images = []
    for img in soup.find_all("img"):
        src = (img.get("src") or "").strip()
        if not src:
            # urljoin() returns the base URL for a missing/empty reference,
            # which would record the page itself as an image.
            continue
        try:
            images.append(urljoin(url, src))
        except ValueError:
            # Ignore a malformed src instead of failing the whole page.
            continue

    return {
        "url": url,
        "text": text,
        "images": images,
    }
# ==============================
# UI
# ==============================
# NOTE(review): the leading glyph in the title and in the input label looks
# like a mis-decoded emoji -- confirm the intended characters.
st.title("๐Efficient Website Crawler")
# ==============================
# STEP 1: ENTER URL
# ==============================
url = st.text_input("๐ Enter Website URL")
if st.button("Crawl Website"):
    # Pasted addresses often carry stray whitespace; strip it before fetching.
    links = crawl_website(url.strip())
    # Always store the result, even when it is empty, so a failed crawl does
    # not leave the previous site's links on screen next to the error message.
    st.session_state.links = links
    if links:
        st.success(f"Found {len(links)} pages")
    else:
        st.error("No links found")
# ==============================
# STEP 2: SELECT PAGES
# ==============================
# Render one checkbox per crawled link; the ticked ones form the selection
# consumed by the extraction step. Nothing is rendered before a crawl.
# NOTE(review): the leading glyph in the subheader looks like a mis-decoded
# emoji -- confirm the intended character.
selected_links = []
if st.session_state.links:
    st.subheader("๐ Select Pages to Crawl")
    selected_links = [
        candidate
        for candidate in st.session_state.links
        if st.checkbox(candidate)
    ]
# ==============================
# STEP 3: EXTRACT DATA
# ==============================
# Extract every selected page when the button is pressed and keep the
# results in session state so they survive Streamlit reruns.
if st.button("Extract Selected Pages"):
    with st.spinner("Extracting content..."):
        # extract_page() yields None for pages it could not fetch; drop those.
        all_data = [
            data for data in map(extract_page, selected_links) if data
        ]
    if all_data:
        st.session_state.data = all_data
        st.success("✅ Data extracted successfully!")
    else:
        st.warning("No data extracted")
# ==============================
# STEP 4: SHOW DATA
# ==============================
# Show the extracted records as a table once any data is available.
# NOTE(review): the leading glyph in the subheader looks like a mis-decoded
# emoji -- confirm the intended character.
_preview_records = st.session_state.data
if _preview_records:
    st.subheader("๐ Extracted Data Preview")
    st.dataframe(pd.DataFrame(_preview_records))
# ==============================
# STEP 5: DOWNLOAD OPTIONS
# ==============================
# Offer the extracted records as CSV and JSON downloads once data exists.
if st.session_state.data:
    st.subheader("⬇️ Download Data")
    df = pd.DataFrame(st.session_state.data)

    # CSV: encoded to UTF-8 bytes before being handed to the download widget.
    csv_bytes = df.to_csv(index=False).encode("utf-8")
    st.download_button(
        label="Download CSV",
        data=csv_bytes,
        file_name="website_data.csv",
        mime="text/csv",
    )

    # JSON: one object per extracted page, pretty-printed.
    json_data = df.to_json(orient="records", indent=2)
    st.download_button(
        label="Download JSON",
        data=json_data,
        file_name="website_data.json",
        mime="application/json",
    )