# =========================================================
# 🌐 WEBSITE CRAWLER + DOWNLOAD TOOL
# =========================================================
# Streamlit app: discover the links on a page, let the user pick some of
# them, extract paragraph text and image URLs from the picked pages, and
# offer the result as CSV / JSON downloads.

from typing import Any, Dict, List, Optional
from urllib.parse import urljoin

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup

# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 10

# ==============================
# PAGE CONFIG
# ==============================
st.set_page_config(page_title="🌐 Website Crawler", layout="wide")

# ==============================
# SESSION STATE
# ==============================
# Streamlit re-runs the whole script on every interaction, so results that
# must survive a rerun are kept in session state.
if "links" not in st.session_state:
    st.session_state.links = []

if "data" not in st.session_state:
    st.session_state.data = []


# ==============================
# CRAWL WEBSITE
# ==============================
def crawl_website(url: str, max_links: int = 30) -> List[str]:
    """Return the absolute http(s) links found on the page at *url*.

    Links are de-duplicated and returned in the order they appear in the
    document, so the same page always yields the same list.

    Args:
        url: Address of the page to scan.
        max_links: Maximum number of links to return.

    Returns:
        Up to *max_links* absolute URLs, or an empty list when the page
        cannot be fetched (network error, invalid URL, HTTP error status).
    """
    try:
        res = requests.get(url, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException:
        return []

    soup = BeautifulSoup(res.text, "html.parser")

    # A dict keeps insertion order, giving an ordered de-duplication.
    found: Dict[str, None] = {}
    for anchor in soup.find_all("a", href=True):
        try:
            link = urljoin(url, anchor["href"])
        except ValueError:
            # One malformed href must not discard every other link.
            continue
        if link.startswith(("http://", "https://")):
            found[link] = None

    return list(found)[:max_links]


# ==============================
# EXTRACT PAGE CONTENT
# ==============================
def extract_page(url: str) -> Optional[Dict[str, Any]]:
    """Fetch *url* and extract its paragraph text and image URLs.

    Args:
        url: Address of the page to extract.

    Returns:
        A dict with keys ``"url"``, ``"text"`` (all ``<p>`` texts joined
        with single spaces) and ``"images"`` (absolute image URLs), or
        ``None`` when the page cannot be fetched.
    """
    try:
        res = requests.get(url, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException:
        return None

    soup = BeautifulSoup(res.text, "html.parser")

    # TEXT
    paragraphs = [p.get_text().strip() for p in soup.find_all("p")]
    text = " ".join(paragraphs)

    # IMAGES
    # Only tags with a non-empty src: urljoin() returns the base URL for a
    # missing src, which would list the page itself as an image.
    images = []
    for img in soup.find_all("img", src=True):
        src = img["src"]
        if not src:
            continue
        try:
            images.append(urljoin(url, src))
        except ValueError:
            continue

    return {
        "url": url,
        "text": text,
        "images": images,
    }


# ==============================
# UI
# ==============================
st.title("🌐Efficient Website Crawler")

# ==============================
# STEP 1: ENTER URL
# ==============================
url = st.text_input("🔗 Enter Website URL").strip()

if st.button("Crawl Website"):
    links = crawl_website(url)

    if links:
        st.session_state.links = links
        st.success(f"Found {len(links)} pages")
    else:
        st.error("No links found")

# ==============================
# STEP 2: SELECT PAGES
# ==============================
selected_links = []

if st.session_state.links:
    st.subheader("📄 Select Pages to Crawl")

    for link in st.session_state.links:
        # Explicit key so each checkbox keeps its own state across reruns.
        if st.checkbox(link, key=f"select_{link}"):
            selected_links.append(link)

# ==============================
# STEP 3: EXTRACT DATA
# ==============================
if st.button("Extract Selected Pages"):
    with st.spinner("Extracting content..."):
        extracted = (extract_page(link) for link in selected_links)
        # Pages that could not be fetched come back as None and are dropped.
        all_data = [page for page in extracted if page]

    if all_data:
        st.session_state.data = all_data
        st.success("✅ Data extracted successfully!")
    else:
        st.warning("No data extracted")

# ==============================
# STEP 4 + 5: SHOW DATA AND DOWNLOAD OPTIONS
# ==============================
if st.session_state.data:
    # Build the frame once and reuse it for the preview and both exports.
    df = pd.DataFrame(st.session_state.data)

    st.subheader("📊 Extracted Data Preview")
    st.dataframe(df)

    st.subheader("⬇️ Download Data")

    # CSV
    csv = df.to_csv(index=False).encode("utf-8")
    st.download_button("Download CSV", csv, "website_data.csv", "text/csv")

    # JSON
    json_data = df.to_json(orient="records", indent=2)
    st.download_button(
        "Download JSON", json_data, "website_data.json", "application/json"
    )