Spaces:
Sleeping
Sleeping
| # ========================================================= | |
| # WEBSITE CRAWLER + DOWNLOAD TOOL | |
| # ========================================================= | |
| import streamlit as st | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import pandas as pd | |
| from urllib.parse import urljoin | |
| # ============================== | |
| # PAGE CONFIG | |
| # ============================== | |
# Set the browser tab title and use the wide page layout.
st.set_page_config(layout="wide", page_title="๐ Website Crawler")
# ==============================
# SESSION STATE
# ==============================
# Seed each session-state slot with its own empty list the first time the
# script runs; later reruns leave the stored values alone.
for _state_key in ("links", "data"):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = []
| # ============================== | |
| # CRAWL WEBSITE | |
| # ============================== | |
def crawl_website(url: str, max_links: int = 30) -> list:
    """Fetch *url* and return the absolute HTTP(S) links found on that page.

    Args:
        url: Address of the page to scan. Blank or missing values are
            rejected without issuing a request.
        max_links: Upper bound on the number of links returned.

    Returns:
        A list of unique absolute URLs in the order they first appear in the
        page, truncated to ``max_links``. The list is empty when the URL is
        blank, the request fails, or the server answers with an HTTP error
        status.
    """
    if not url or not url.strip():
        return []
    page_url = url.strip()

    try:
        res = requests.get(page_url, timeout=10)
        # Treat 4xx/5xx answers as failures instead of harvesting the links
        # of an error page.
        res.raise_for_status()
    except requests.RequestException:
        return []

    soup = BeautifulSoup(res.text, "html.parser")

    # A dict keeps insertion order, so de-duplicating through it makes the
    # truncation below deterministic (a set would keep an arbitrary subset).
    found = {}
    for anchor in soup.find_all("a", href=True):
        try:
            link = urljoin(page_url, anchor["href"])
        except ValueError:
            # One malformed href must not discard every other link.
            continue
        if link.startswith(("http://", "https://")):
            found.setdefault(link, None)

    return list(found)[:max_links]
| # ============================== | |
| # EXTRACT PAGE CONTENT | |
| # ============================== | |
def extract_page(url: str):
    """Download *url* and collect its paragraph text and image addresses.

    Args:
        url: Address of the page to extract. Blank or missing values are
            rejected without issuing a request.

    Returns:
        A dict with the keys ``"url"`` (the value passed in), ``"text"``
        (the non-empty ``<p>`` contents joined by single spaces) and
        ``"images"`` (absolute URLs of every ``<img>`` that has a ``src``),
        or ``None`` when the URL is blank, the request fails, or the server
        answers with an HTTP error status.
    """
    if not url or not url.strip():
        return None
    page_url = url.strip()

    try:
        res = requests.get(page_url, timeout=10)
        # Do not report the body of a 4xx/5xx error page as page content.
        res.raise_for_status()
    except requests.RequestException:
        return None

    soup = BeautifulSoup(res.text, "html.parser")

    # TEXT: skip empty paragraphs so they do not leave stray double spaces.
    paragraphs = (p.get_text().strip() for p in soup.find_all("p"))
    text = " ".join(chunk for chunk in paragraphs if chunk)

    # IMAGES: only tags that actually carry a src. Joining a missing src
    # would yield the page URL itself, which is not an image.
    images = []
    for img in soup.find_all("img", src=True):
        src = img["src"].strip()
        if not src:
            continue
        try:
            images.append(urljoin(page_url, src))
        except ValueError:
            # Skip a malformed src instead of failing the whole page.
            continue

    return {
        "url": url,
        "text": text,
        "images": images,
    }
| # ============================== | |
| # UI | |
| # ============================== | |
st.title("๐Efficient Website Crawler")
# ==============================
# STEP 1: ENTER URL
# ==============================
# Strip surrounding whitespace so a pasted URL with a trailing space still
# resolves.
url = st.text_input("๐ Enter Website URL").strip()
if st.button("Crawl Website"):
    links = crawl_website(url)
    # Always store the latest result: otherwise a failed crawl would show
    # "No links found" while the previous site's links stay selectable.
    st.session_state.links = links
    if links:
        st.success(f"Found {len(links)} pages")
    else:
        st.error("No links found")
| # ============================== | |
| # STEP 2: SELECT PAGES | |
| # ============================== | |
# Links ticked by the user on this run; stays empty until a crawl has
# produced something to choose from.
selected_links = []
if st.session_state.links:
    st.subheader("๐ Select Pages to Crawl")
    # One checkbox per crawled link, rendered in stored order; keep the
    # links whose box is currently checked.
    selected_links = [
        candidate
        for candidate in st.session_state.links
        if st.checkbox(candidate)
    ]
| # ============================== | |
| # STEP 3: EXTRACT DATA | |
| # ============================== | |
if st.button("Extract Selected Pages"):
    with st.spinner("Extracting content..."):
        # extract_page returns a falsy value on failure; keep only the
        # pages that produced a result, in selection order.
        extracted = [
            page for page in map(extract_page, selected_links) if page
        ]
    if not extracted:
        st.warning("No data extracted")
    else:
        st.session_state.data = extracted
        st.success("โ Data extracted successfully!")
| # ============================== | |
| # STEP 4: SHOW DATA | |
| # ============================== | |
# Render the stored extraction results as a table, if there are any.
if st.session_state.data:
    st.subheader("๐ Extracted Data Preview")
    st.dataframe(pd.DataFrame(st.session_state.data))
| # ============================== | |
| # STEP 5: DOWNLOAD OPTIONS | |
| # ============================== | |
# Offer the stored extraction results as CSV and JSON downloads.
if st.session_state.data:
    st.subheader("โฌ๏ธ Download Data")
    df = pd.DataFrame(st.session_state.data)
    # CSV
    csv = df.to_csv(index=False).encode("utf-8")
    st.download_button(
        label="Download CSV",
        data=csv,
        file_name="website_data.csv",
        mime="text/csv",
    )
    # JSON
    # force_ascii=False keeps non-ASCII page text readable in the file
    # instead of writing it as \uXXXX escape sequences.
    json_data = df.to_json(orient="records", indent=2, force_ascii=False)
    st.download_button(
        label="Download JSON",
        data=json_data,
        file_name="website_data.json",
        mime="application/json",
    )