# File: sample_crawler/src/streamlit_app.py
# Hosting-page header captured with the source (not part of the program):
#   pradeep4321 - "Update src/streamlit_app.py" - 87104d9 (verified)
# =========================================================
# ๐ŸŒ WEBSITE CRAWLER + DOWNLOAD TOOL
# =========================================================
import streamlit as st
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
# ==============================
# PAGE CONFIG
# ==============================
# Browser-tab title and wide layout; must run before any other Streamlit call.
# The title's emoji had been corrupted by a wrong-codec decode and is restored.
st.set_page_config(page_title="🌐 Website Crawler", layout="wide")
# ==============================
# SESSION STATE
# ==============================
# Seed the per-session stores on the first run so the steps below can read
# them unconditionally: "links" receives the crawl result and "data" the
# extracted pages.
for _key in ("links", "data"):
    if _key not in st.session_state:
        st.session_state[_key] = []
# ==============================
# CRAWL WEBSITE
# ==============================
def crawl_website(url, max_links=30):
    """Fetch *url* and return the absolute HTTP(S) links found on that page.

    Only the single page at *url* is downloaded; linked pages are not
    followed.

    Args:
        url: Address of the page to scan.
        max_links: Upper bound on the number of links returned (default 30).

    Returns:
        A list of unique absolute URLs in the order they first appear in the
        document, truncated to *max_links*. An empty list is returned when
        the page cannot be fetched or answers with an HTTP error status.
    """
    try:
        res = requests.get(url, timeout=10)
        # Treat 4xx/5xx answers as failures instead of crawling an error page.
        res.raise_for_status()
    except (requests.RequestException, ValueError):
        # Best effort: callers interpret an empty list as "nothing found".
        return []

    soup = BeautifulSoup(res.text, "html.parser")

    # A dict de-duplicates while keeping first-seen order, so the truncated
    # result is the same on every run (a set's iteration order is not).
    links = {}
    for a in soup.find_all("a", href=True):
        try:
            link = urljoin(url, a["href"])
        except ValueError:
            # urljoin rejects some malformed hrefs; skip just that link.
            continue
        # Drop mailto:, javascript:, tel: and similar non-web schemes.
        if link.startswith(("http://", "https://")):
            links[link] = None
    return list(links)[:max_links]
# ==============================
# EXTRACT PAGE CONTENT
# ==============================
def extract_page(url):
    """Download *url* and pull out its paragraph text and image URLs.

    Args:
        url: Address of the page to extract.

    Returns:
        A dict with the keys ``"url"`` (the address as given), ``"text"``
        (the stripped text of every ``<p>`` element joined by single spaces)
        and ``"images"`` (absolute URLs of the ``<img>`` elements that carry
        a ``src``). ``None`` is returned when the page cannot be fetched or
        answers with an HTTP error status.
    """
    try:
        res = requests.get(url, timeout=10)
        # Treat 4xx/5xx answers as failures instead of extracting an error page.
        res.raise_for_status()
    except (requests.RequestException, ValueError):
        # Best effort: callers skip pages for which None is returned.
        return None

    soup = BeautifulSoup(res.text, "html.parser")

    # TEXT
    text = " ".join(p.get_text().strip() for p in soup.find_all("p"))

    # IMAGES
    images = []
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src:
            # urljoin(url, None) returns the page URL itself, which would be
            # recorded as a bogus image entry.
            continue
        try:
            images.append(urljoin(url, src))
        except ValueError:
            # urljoin rejects some malformed sources; skip just that image.
            continue

    return {
        "url": url,
        "text": text,
        "images": images,
    }
# ==============================
# UI
# ==============================
# Main page heading. The emoji had been corrupted by a wrong-codec decode
# and is restored here.
st.title("🌐Efficient Website Crawler")
# ==============================
# STEP 1: ENTER URL
# ==============================
# Ignore stray whitespace around a pasted address.
url = st.text_input("🔗 Enter Website URL").strip()
if st.button("Crawl Website"):
    if not url:
        # Ask for input rather than reporting a misleading "No links found".
        st.warning("Please enter a website URL")
    else:
        links = crawl_website(url)
        if links:
            # Persist the result so it survives the rerun triggered by the
            # next widget interaction.
            st.session_state.links = links
            st.success(f"Found {len(links)} pages")
        else:
            st.error("No links found")
# ==============================
# STEP 2: SELECT PAGES
# ==============================
# Links ticked by the user on this run; stays empty until a crawl has
# produced links to choose from.
selected_links = []
if st.session_state.links:
    st.subheader("📄 Select Pages to Crawl")
    # One checkbox per crawled link; the link text doubles as the label.
    for link in st.session_state.links:
        if st.checkbox(link):
            selected_links.append(link)
# ==============================
# STEP 3: EXTRACT DATA
# ==============================
if st.button("Extract Selected Pages"):
    if not selected_links:
        # Nothing ticked: say so instead of reporting a failed extraction.
        st.warning("Select at least one page to extract")
    else:
        all_data = []
        failed = []
        with st.spinner("Extracting content..."):
            for link in selected_links:
                data = extract_page(link)
                if data:
                    all_data.append(data)
                else:
                    failed.append(link)

        if all_data:
            # Persist the result so the preview and download sections can
            # still show it after the next rerun.
            st.session_state.data = all_data
            st.success("✅ Data extracted successfully!")
        else:
            st.warning("No data extracted")

        if failed and all_data:
            # Surface partial failures that would otherwise go unnoticed.
            st.warning(f"Could not extract {len(failed)} page(s): " + ", ".join(failed))
# ==============================
# STEP 4: SHOW DATA
# ==============================
# Preview the extracted pages as a table, one row per page with the columns
# "url", "text" and "images".
if st.session_state.data:
    st.subheader("📊 Extracted Data Preview")
    df = pd.DataFrame(st.session_state.data)
    st.dataframe(df)
# ==============================
# STEP 5: DOWNLOAD OPTIONS
# ==============================
# Offer the extracted pages as CSV and JSON downloads.
if st.session_state.data:
    st.subheader("⬇️ Download Data")
    df = pd.DataFrame(st.session_state.data)

    # CSV: encoded to UTF-8 bytes before being handed to the download button.
    csv = df.to_csv(index=False).encode("utf-8")
    st.download_button("Download CSV", csv, "website_data.csv", "text/csv")

    # JSON: one object per page, pretty-printed.
    json_data = df.to_json(orient="records", indent=2)
    st.download_button("Download JSON", json_data, "website_data.json", "application/json")