Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import json | |
| import time | |
| from datetime import datetime | |
| import requests | |
| from urllib.parse import urlparse | |
| import io | |
| import base64 | |
| from scraper import scraper | |
| from youtube_scraper import youtube_scraper | |
| from instagram_scraper import instagram_scraper | |
| from instagram_scraper_v2 import instagram_scraper_v2 | |
# Page configuration
_PAGE_OPTIONS = {
    "page_title": "Scrape Anythings",
    "page_icon": "🕷️",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_OPTIONS)

# Custom CSS for better styling: classes used by the header markup in main()
# (.main-header, .sub-header) plus card/box helpers.
_CUSTOM_CSS = """
<style>
.main-header {
font-size: 2.5rem;
font-weight: bold;
color: #1f77b4;
text-align: center;
margin-bottom: 2rem;
}
.sub-header {
font-size: 1.2rem;
color: #666;
text-align: center;
margin-bottom: 2rem;
}
.metric-card {
background-color: #f0f2f6;
padding: 1rem;
border-radius: 0.5rem;
border-left: 4px solid #1f77b4;
}
.success-box {
background-color: #d4edda;
border: 1px solid #c3e6cb;
border-radius: 0.5rem;
padding: 1rem;
margin: 1rem 0;
}
.error-box {
background-color: #f8d7da;
border: 1px solid #f5c6cb;
border-radius: 0.5rem;
padding: 1rem;
margin: 1rem 0;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
def validate_url(url):
    """Return True when ``url`` parses with both a scheme and a network location.

    Args:
        url: The candidate URL, normally the text typed into the sidebar.

    Returns:
        bool: True if ``urlparse`` yields a non-empty scheme and netloc,
        False otherwise (including when parsing fails).
    """
    try:
        parsed = urlparse(url)
    except (ValueError, AttributeError, TypeError):
        # ValueError: malformed input such as an unterminated IPv6 literal.
        # AttributeError/TypeError: non-string input that urlparse cannot coerce.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return False
    return bool(parsed.scheme and parsed.netloc)
def perform_web_scraping(url, data_types, max_pages=1, rate_limit=2):
    """Scrape ``url`` through the imported ``scraper`` object.

    Shows an info banner, lower-cases the requested data type names, and
    returns whatever ``scraper.scrape_website`` produces while a spinner
    is displayed.

    Args:
        url: Address of the page to scrape.
        data_types: Iterable of data type names; each is lower-cased before
            being handed to the scraper.
        max_pages: Forwarded unchanged to ``scraper.scrape_website``.
        rate_limit: Forwarded unchanged to ``scraper.scrape_website``.

    Returns:
        The value returned by ``scraper.scrape_website``.
    """
    st.info("🔍 Starting web scraping...")
    requested_types = [name.lower() for name in data_types]
    with st.spinner("Crawling website..."):
        return scraper.scrape_website(url, requested_types, max_pages, rate_limit)
def display_results(scraped_data, is_youtube=False, is_instagram=False):
    """Route scraped data to the renderer matching its source.

    YouTube takes precedence over Instagram; anything else is rendered as a
    regular website result.
    """
    if is_youtube:
        display_youtube_results(scraped_data)
        return
    if is_instagram:
        display_instagram_results(scraped_data)
        return
    display_regular_results(scraped_data)
def display_text_results(text_data):
    """Render the title, headings and paragraphs of scraped text.

    Args:
        text_data: Mapping read via ``.get`` for ``title``, ``headings``
            (entries read via ``.get`` for ``level`` and ``text``) and
            ``paragraphs``.
    """
    page_title = text_data.get('title', 'N/A')
    st.write(f"**Title:** {page_title}")

    heading_entries = text_data.get("headings", [])
    with st.expander("Headings"):
        for entry in heading_entries:
            level = entry.get('level', 'h?')
            label = entry.get('text', '')
            st.write(f"- **{level}**: {label}")

    paragraph_entries = text_data.get("paragraphs", [])
    with st.expander("Paragraphs"):
        for paragraph in paragraph_entries:
            st.write(f"- {paragraph}")
def display_image_results(images):
    """Render scraped images in a grid of at most four columns.

    Args:
        images: Iterable of mappings with a ``src`` entry and an optional
            ``alt`` entry. Entries without a usable ``src`` are skipped.
    """
    max_columns = 4
    caption_limit = 50

    # st.image has nothing to render for an empty source, so drop those
    # entries before sizing the grid.
    renderable = [img for img in (images or []) if img.get("src")]
    if not renderable:
        # Avoid calling st.columns with a count of zero.
        st.info("No images were extracted.")
        return

    cols = st.columns(min(max_columns, len(renderable)))
    for i, img in enumerate(renderable):
        # ``alt`` may be present but None/empty; fall back to a generic label.
        alt_text = str(img.get("alt") or "Image")
        # Only mark the caption as truncated when it really was.
        if len(alt_text) > caption_limit:
            caption = f"{alt_text[:caption_limit]}..."
        else:
            caption = alt_text
        with cols[i % max_columns]:
            st.image(img["src"], caption=caption, use_column_width=True)
def display_table_results(tables):
    """Render each scraped table inside its own expander.

    Elsewhere in this file tables are handled both as mappings with
    ``header``/``rows`` entries and as pandas DataFrames, so both shapes
    are accepted here.

    Args:
        tables: Iterable of tables, each either a DataFrame or a mapping
            with ``header`` and ``rows`` entries.
    """
    for i, table in enumerate(tables):
        if isinstance(table, pd.DataFrame):
            header = list(table.columns)
            df = table
        else:
            header = table.get('header', [])
            df = pd.DataFrame(table.get('rows', []))
        with st.expander(f"Table {i+1} (Header: {header})"):
            st.dataframe(df)
def display_link_results(links):
    """Write every scraped link as a Markdown bullet.

    Args:
        links: Iterable of mappings read via ``.get`` for ``text`` and
            ``href``.
    """
    for entry in links:
        label = entry.get('text', 'N/A')
        target = entry.get('href', '#')
        st.write(f"- [{label}]({target})")
def display_metadata_results(metadata):
    """Show the scraped metadata as a JSON view."""
    st.json(metadata)
def display_regular_results(scraped_data):
    """Render every section of a regular website scrape.

    Each section gets a subheader; a section whose entry in ``scraped_data``
    is missing or empty shows an informational note instead of content.
    """

    def _show_numbers(numbers):
        # Numbers are rendered inline, inside a collapsed expander.
        with st.expander("Extracted Numbers", expanded=False):
            st.write(numbers)

    # (subheader, key in scraped_data, renderer, note shown when empty)
    sections = (
        ("📝 Text Content", "text_content", display_text_results,
         "No text content was extracted."),
        ("🖼️ Images", "images", display_image_results,
         "No images were extracted."),
        ("🔢 Numbers", "numbers", _show_numbers,
         "No numbers were extracted."),
        ("📊 Tables", "tables", display_table_results,
         "No tables were extracted."),
        ("🔗 Links", "links", display_link_results,
         "No links were extracted."),
        ("📄 Metadata", "metadata", display_metadata_results,
         "No metadata was extracted."),
    )

    for title, key, render, empty_note in sections:
        st.subheader(title)
        if scraped_data.get(key):
            render(scraped_data[key])
        else:
            st.info(empty_note)
def _excel_cell(value):
    """Return ``value`` in a form a spreadsheet cell can hold (containers become JSON text)."""
    if isinstance(value, (dict, list, tuple, set)):
        return json.dumps(value, default=str, ensure_ascii=False)
    return value


def _excel_records_frame(items, column):
    """Build a sheet from a list: dict items become rows, other items fill one column."""
    if not isinstance(items, (list, tuple)):
        items = [items]
    if all(isinstance(item, dict) for item in items):
        return pd.DataFrame(
            [{key: _excel_cell(val) for key, val in item.items()} for item in items]
        )
    return pd.DataFrame({column: [_excel_cell(item) for item in items]})


def _excel_text_frame(text):
    """Build the 'Text' sheet from either a plain string or a title/headings/paragraphs mapping."""
    if not isinstance(text, dict):
        return pd.DataFrame({'Text': [_excel_cell(text)]})

    rows = []
    if text.get("title"):
        rows.append(("Title", _excel_cell(text["title"])))
    for heading in text.get("headings") or []:
        if isinstance(heading, dict):
            label = f"Heading ({heading.get('level', 'h?')})"
            rows.append((label, _excel_cell(heading.get('text', ''))))
        else:
            rows.append(("Heading", _excel_cell(heading)))
    for paragraph in text.get("paragraphs") or []:
        rows.append(("Paragraph", _excel_cell(paragraph)))
    # Keep any other entries so nothing scraped is silently dropped.
    for key, val in text.items():
        if key not in ("title", "headings", "paragraphs"):
            rows.append((str(key), _excel_cell(val)))
    return pd.DataFrame(rows, columns=["Section", "Text"])


def _excel_table_frame(table):
    """Normalize one scraped table (DataFrame or header/rows mapping) to a DataFrame."""
    if isinstance(table, pd.DataFrame):
        return table
    if isinstance(table, dict):
        frame = pd.DataFrame(table.get("rows") or [])
        header = table.get("header") or []
        # Only apply the header when it lines up with the parsed columns.
        if header and len(header) == frame.shape[1]:
            frame.columns = [str(cell) for cell in header]
        return frame
    return pd.DataFrame(table)


def to_excel(data):
    """Convert a dictionary of scraped data to an Excel workbook in memory.

    One sheet is written per non-empty entry: ``links``, ``images``,
    ``numbers``, ``text_content``, ``metadata``, ``video_info``,
    ``profile_info``, ``comments`` and one sheet per item in ``tables``.
    Nested values (dicts, lists) are serialized to JSON text because they
    cannot be stored in a spreadsheet cell.

    Args:
        data: Mapping of scraped results; ``None`` is treated as empty.

    Returns:
        bytes: The ``.xlsx`` file content.

    Raises:
        ValueError: If ``data`` contains nothing that can be exported.
    """
    data = data or {}
    sheets = []  # (sheet name, DataFrame), in writing order

    # Handle simple lists (links, images, numbers)
    for key in ("links", "images", "numbers"):
        if data.get(key):
            name = key.capitalize()
            sheets.append((name, _excel_records_frame(data[key], name)))

    # Handle text content
    if data.get("text_content"):
        sheets.append(("Text", _excel_text_frame(data["text_content"])))

    # Handle dictionaries (metadata, video_info, profile_info)
    for key in ("metadata", "video_info", "profile_info"):
        section = data.get(key)
        if not section:
            continue
        name = key.replace('_', ' ').capitalize()
        if isinstance(section, dict):
            pairs = [(prop, _excel_cell(val)) for prop, val in section.items()]
            frame = pd.DataFrame(pairs, columns=['Property', 'Value'])
        else:
            frame = _excel_records_frame(section, name)
        sheets.append((name, frame))

    # Handle list of dictionaries (comments)
    if data.get("comments"):
        sheets.append(("Comments", _excel_records_frame(data["comments"], "Comments")))

    # Handle tables (DataFrames or header/rows mappings)
    for i, table in enumerate(data.get("tables") or []):
        sheets.append((f'Table_{i+1}', _excel_table_frame(table)))

    if not sheets:
        # A workbook needs at least one sheet; fail early with a clear message
        # instead of an opaque error when the writer is closed.
        raise ValueError("No exportable data found.")

    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='openpyxl') as writer:
        for name, frame in sheets:
            frame.to_excel(writer, sheet_name=name, index=False)
    return output.getvalue()
def _download_table_csv(table):
    """Return one scraped table (DataFrame or header/rows mapping) as CSV text."""
    if isinstance(table, pd.DataFrame):
        return table.to_csv(index=False)
    if isinstance(table, dict):
        frame = pd.DataFrame(table.get("rows") or [])
        header = table.get("header") or []
        if header and len(header) == frame.shape[1]:
            frame.columns = [str(cell) for cell in header]
            return frame.to_csv(index=False)
        # No usable header: omit the auto-generated 0..n column labels.
        return frame.to_csv(index=False, header=False)
    return pd.DataFrame(table).to_csv(index=False)


def _download_text(text_content):
    """Flatten scraped text (string or title/headings/paragraphs mapping) to plain text."""
    if not text_content:
        return ""
    if not isinstance(text_content, dict):
        return str(text_content)

    lines = [f"Title: {text_content.get('title', 'N/A')}", "", "Headings:"]
    for heading in text_content.get("headings") or []:
        if isinstance(heading, dict):
            lines.append(f"- {heading.get('level', 'h?')}: {heading.get('text', '')}")
        else:
            lines.append(f"- {heading}")
    lines.extend(["", "Paragraphs:"])
    for i, para in enumerate(text_content.get("paragraphs") or [], 1):
        lines.append(f"{i}. {para}")
    return "\n".join(lines) + "\n"


def create_download_links(scraped_data):
    """Render the download section with JSON, CSV, TXT and Excel buttons.

    A format with nothing to export is shown as a disabled button with a
    tooltip explaining why.

    Args:
        scraped_data: Mapping of scraped results; ``None`` is treated as empty.
    """
    scraped_data = scraped_data or {}

    st.header("Download Data")
    col1, col2, col3, col4 = st.columns(4)

    # JSON download
    with col1:
        # default=str keeps the dump from failing on non-JSON values.
        json_str = json.dumps(scraped_data, indent=2, default=str)
        st.download_button(
            label="Download JSON",
            data=json_str,
            file_name="scraped_data.json",
            mime="application/json",
            use_container_width=True
        )

    # CSV download
    with col2:
        tables = scraped_data.get("tables")
        if tables:
            # For simplicity, we'll offer the first table as a CSV download
            csv = _download_table_csv(tables[0])
            st.download_button(
                label="Download CSV",
                data=csv,
                file_name="scraped_table.csv",
                mime="text/csv",
                use_container_width=True
            )
        else:
            st.button("Download CSV", disabled=True, help="No tables found to download.", use_container_width=True)

    # TXT download
    with col3:
        # Text content is a mapping elsewhere in this file, so flatten it to
        # a string before handing it to the download widget.
        text_content = _download_text(scraped_data.get("text_content"))
        if text_content:
            st.download_button(
                label="Download TXT",
                data=text_content,
                file_name="scraped_text.txt",
                mime="text/plain",
                use_container_width=True
            )
        else:
            st.button("Download TXT", disabled=True, help="No text content found to download.", use_container_width=True)

    # Excel download
    with col4:
        try:
            excel_data = to_excel(scraped_data)
            st.download_button(
                label="Download Excel",
                data=excel_data,
                file_name="scraped_data.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                use_container_width=True
            )
        except Exception as e:
            # Best effort: an export failure must not take the whole page down.
            st.button("Download Excel", disabled=True, help=f"Excel export failed: {e}", use_container_width=True)
def display_youtube_results(scraped_data):
    """Render the video summary and, when present, the scraped comments.

    Shows an error and stops when ``scraped_data`` has no ``video_info``.
    """
    video_info = scraped_data.get("video_info")
    if not video_info:
        st.error("Could not extract YouTube video information.")
        return

    st.subheader(f'{video_info.get("title", "Untitled")}')
    st.write(f'**Channel:** {video_info.get("channel", "N/A")}')
    st.write(f'**Views:** {video_info.get("views", "N/A")}')
    with st.expander("Video Description"):
        st.write(video_info.get("description", "No description."))

    comments = scraped_data.get("comments")
    if not comments:
        return

    with st.expander(f'Comments ({len(comments)})'):
        for entry in comments:
            author = entry.get('author', 'Unknown')
            posted = entry.get('timestamp', 'Unknown')
            st.markdown(f"**{author}** - {posted}")
            st.write(entry.get('text', ''))
            like_count = entry.get('likes', '0')
            # The like line is omitted only when the count is the string '0'.
            if like_count != '0':
                st.caption(f"👍 {like_count} likes")
            st.divider()
def display_instagram_results(scraped_data):
    """Render the Instagram profile summary.

    Shows an error and stops when ``scraped_data`` has no ``profile_info``.
    """
    profile_info = scraped_data.get("profile_info")
    if not profile_info:
        st.error("Could not extract Instagram profile information.")
        return

    with st.expander("Profile Information", expanded=True):
        username = profile_info.get("username", "N/A")
        display_name = profile_info.get("display_name", "N/A")
        bio = profile_info.get("bio", "N/A")
        followers = profile_info.get("followers", "N/A")
        st.write(f'**Username:** {username}')
        st.write(f'**Display Name:** {display_name}')
        st.write(f'**Bio:** {bio}')
        st.write(f'**Followers:** {followers}')
def main():
    """Build the page: sidebar configuration, scraping run, results and downloads.

    The sidebar collects the URL and the options for the detected platform
    (YouTube, Instagram, or a regular website). When the start button is
    pressed the inputs are validated, the matching scraper is called, and
    the results plus download buttons are rendered. Without a button press
    a short usage guide is shown.
    """
    # Header
    st.markdown('<h1 class="main-header">✨ Scrape Anythings</h1>', unsafe_allow_html=True)
    st.markdown('<p class="sub-header">Extract data from any website with ease</p>', unsafe_allow_html=True)

    # Sidebar for configuration
    with st.sidebar:
        st.header("Configuration")
        # Pasted URLs often carry stray whitespace; strip it before use.
        url = (st.text_input("Enter Website URL", placeholder="https://example.com") or "").strip()
        lowered_url = url.lower()
        is_youtube = "youtube.com" in lowered_url or "youtu.be" in lowered_url
        is_instagram = "instagram.com" in lowered_url

        # Defaults for the options that are only set for one platform.
        data_types, youtube_data_types, instagram_data_types, max_comments = [], [], [], 50
        if is_youtube:
            st.info("YouTube URL detected!")
            youtube_data_types = st.multiselect("YouTube Data Types", ["video_info", "comments"], default=["video_info", "comments"])
            if "comments" in youtube_data_types:
                max_comments = st.slider("Max Comments", 10, 200, 50)
        elif is_instagram:
            st.info("Instagram URL detected!")
            instagram_data_types = st.multiselect("Instagram Data Types", ["profile_info", "images", "posts"], default=["profile_info", "images"])
        else:
            data_types = st.multiselect("Data Types", ["Text", "Images", "Links", "Tables", "Metadata", "Numbers"], default=["Text", "Links"])

        st.subheader("Advanced Options")
        max_pages = st.slider("Max Pages", 1, 10, 1)
        rate_limit = st.slider("Rate Limit (s)", 1, 10, 2)
        scrape_button = st.button("Start Scraping", type="primary", use_container_width=True)

    # Main content area
    if scrape_button:
        if not url or not validate_url(url):
            st.error("Please enter a valid URL.")
            return

        # Validate that at least one data type is selected for the given URL type
        if is_youtube and not youtube_data_types:
            st.error("Please select at least one YouTube data type to extract.")
            return
        elif is_instagram and not instagram_data_types:
            st.error("Please select at least one Instagram data type to extract.")
            return
        elif not is_youtube and not is_instagram and not data_types:
            st.error("Please select at least one data type to extract.")
            return

        with st.spinner("Scraping in progress... Please wait."):
            try:
                if is_youtube:
                    scraped_data = youtube_scraper.scrape_youtube_video(url, "comments" in youtube_data_types, max_comments)
                elif is_instagram:
                    try:
                        scraped_data = instagram_scraper_v2.extract_instagram_data(url)
                    except Exception:
                        st.warning("Improved scraper failed, trying fallback...")
                        scraped_data = instagram_scraper.extract_instagram_data(url)
                else:
                    # perform_web_scraping lower-cases the type names itself.
                    scraped_data = perform_web_scraping(url, data_types, max_pages, rate_limit)

                # A scraper that returns None is treated as "nothing found"
                # rather than crashing on the lookups below.
                scraped_data = scraped_data or {}

                if scraped_data.get("errors"):
                    st.error(f'Errors: {scraped_data["errors"]}')

                # Check if any data was actually scraped before showing success
                has_data = any(scraped_data.get(key) for key in ["text_content", "images", "numbers", "tables", "links", "metadata", "video_info", "profile_info"])
                if has_data:
                    st.success("Scraping completed successfully!")
                    st.header("Scraping Results")
                    display_results(scraped_data, is_youtube, is_instagram)
                    # create_download_links renders its own "Download Data"
                    # header, so none is added here.
                    create_download_links(scraped_data)
                else:
                    st.warning("No data was extracted. The website might be blocking scrapers or the content is not available.")
            except Exception as e:
                # Top-level boundary: report the failure in the page.
                st.error(f"An unexpected error occurred: {e}")
    else:
        st.markdown(
            "\n"
            "### How to Use\n"
            "1. **Enter URL** and **select data types** in the sidebar.\n"
            "2. Click **Start Scraping** to begin.\n"
            "3. View and **download the results** below.\n"
        )


if __name__ == "__main__":
    main()