Spaces:
Sleeping
Sleeping
File size: 15,031 Bytes
5c3dc0d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 |
import streamlit as st
import pandas as pd
import json
import time
from datetime import datetime
import requests
from urllib.parse import urlparse
import io
import base64
from scraper import scraper
from youtube_scraper import youtube_scraper
from instagram_scraper import instagram_scraper
from instagram_scraper_v2 import instagram_scraper_v2
# Page configuration: browser-tab title and icon, wide layout, sidebar open on load.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="expanded",
    page_title="Scrape Anythings",
    page_icon="🕷️",
)
# Stylesheet injected into the page. It defines the classes used by the header
# markup in main() (.main-header, .sub-header) plus card / message-box helpers.
_CUSTOM_CSS = """
<style>
.main-header {
font-size: 2.5rem;
font-weight: bold;
color: #1f77b4;
text-align: center;
margin-bottom: 2rem;
}
.sub-header {
font-size: 1.2rem;
color: #666;
text-align: center;
margin-bottom: 2rem;
}
.metric-card {
background-color: #f0f2f6;
padding: 1rem;
border-radius: 0.5rem;
border-left: 4px solid #1f77b4;
}
.success-box {
background-color: #d4edda;
border: 1px solid #c3e6cb;
border-radius: 0.5rem;
padding: 1rem;
margin: 1rem 0;
}
.error-box {
background-color: #f8d7da;
border: 1px solid #f5c6cb;
border-radius: 0.5rem;
padding: 1rem;
margin: 1rem 0;
}
</style>
"""
# unsafe_allow_html is required so the <style> tag is emitted as raw HTML.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
def validate_url(url):
    """Return True when *url* parses with both a scheme and a network location.

    Only the URL's shape is checked (e.g. ``https://example.com``); no request
    is made and the scheme is not restricted to HTTP(S).

    Args:
        url: The candidate URL, normally the string typed into the sidebar.

    Returns:
        bool: True if both ``scheme`` and ``netloc`` are non-empty, else False.
        Malformed or non-string input yields False instead of raising.
    """
    try:
        parsed = urlparse(url)
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed netloc such as an unclosed IPv6 bracket.
        # TypeError / AttributeError: input that is not str or bytes.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return False
    return bool(parsed.scheme and parsed.netloc)
def perform_web_scraping(url, data_types, max_pages=1, rate_limit=2):
    """Scrape a regular website through the shared ``scraper`` object.

    An info banner is shown first, every requested data type is lower-cased,
    and ``scraper.scrape_website`` is invoked while a spinner is displayed.

    Args:
        url: Address of the site to scrape.
        data_types: Iterable of data-type names; each is lower-cased before
            being forwarded.
        max_pages: Forwarded unchanged to ``scraper.scrape_website``.
        rate_limit: Forwarded unchanged to ``scraper.scrape_website``.

    Returns:
        Whatever ``scraper.scrape_website`` returns.
    """
    st.info("🔍 Starting web scraping...")
    normalized_types = [name.lower() for name in data_types]
    with st.spinner("Crawling website..."):
        return scraper.scrape_website(url, normalized_types, max_pages, rate_limit)
def display_results(scraped_data, is_youtube=False, is_instagram=False):
    """Dispatch scraped data to the renderer matching its source.

    The YouTube flag takes precedence over the Instagram flag; when neither is
    set the generic website renderer is used.
    """
    if is_youtube:
        display_youtube_results(scraped_data)
        return
    if is_instagram:
        display_instagram_results(scraped_data)
        return
    display_regular_results(scraped_data)
def display_text_results(text_data):
    """Render the page title plus collapsible lists of headings and paragraphs.

    ``text_data`` is read as a mapping with optional ``title``, ``headings``
    (items providing ``level`` and ``text``) and ``paragraphs`` entries.
    """
    page_title = text_data.get('title', 'N/A')
    st.write(f"**Title:** {page_title}")

    with st.expander("Headings"):
        for entry in text_data.get("headings", []):
            level = entry.get('level', 'h?')
            label = entry.get('text', '')
            st.write(f"- **{level}**: {label}")

    with st.expander("Paragraphs"):
        for paragraph in text_data.get("paragraphs", []):
            st.write(f"- {paragraph}")
def display_image_results(images):
    """Render scraped images in a grid of at most four columns.

    Args:
        images: Sequence of mappings read via ``src`` (image location) and
            ``alt`` (caption text). An empty sequence renders nothing.
    """
    if not images:
        # Asking Streamlit for zero columns is an error, so bail out early.
        return

    max_columns = 4
    caption_limit = 50

    columns = st.columns(min(max_columns, len(images)))
    for index, image in enumerate(images):
        # A missing, None or empty alt attribute falls back to a generic label.
        alt_text = str(image.get("alt") or "Image")
        # Only add the ellipsis when the caption was really shortened.
        if len(alt_text) > caption_limit:
            caption = f"{alt_text[:caption_limit]}..."
        else:
            caption = alt_text
        with columns[index % len(columns)]:
            st.image(image.get("src", ""), caption=caption, use_column_width=True)
def display_table_results(tables):
    """Show each scraped table inside its own expander.

    Every table is read via ``header`` (used only in the expander label) and
    ``rows`` (turned into the displayed DataFrame).
    """
    for position, table in enumerate(tables, start=1):
        header = table.get('header', [])
        with st.expander(f"Table {position} (Header: {header})"):
            frame = pd.DataFrame(table.get('rows', []))
            st.dataframe(frame)
def display_link_results(links):
    """Write every scraped link as a Markdown bullet ``- [text](href)``."""
    for entry in links:
        label = entry.get('text', 'N/A')
        target = entry.get('href', '#')
        st.write(f"- [{label}]({target})")
def display_metadata_results(metadata):
    """Render the scraped metadata through Streamlit's JSON viewer."""
    st.json(metadata)
def display_regular_results(scraped_data):
    """Render every section of a regular website scrape, in a fixed order.

    Each section gets a subheader; when ``scraped_data`` holds a truthy value
    under the section's key it is handed to the section renderer, otherwise an
    informational "nothing extracted" message is shown.
    """

    def _show_numbers(numbers):
        # Numbers have no dedicated display function: a collapsed expander.
        with st.expander("Extracted Numbers", expanded=False):
            st.write(numbers)

    # (subheader, key in scraped_data, renderer, message when empty)
    sections = (
        ("📝 Text Content", "text_content", display_text_results,
         "No text content was extracted."),
        ("🖼️ Images", "images", display_image_results,
         "No images were extracted."),
        ("🔢 Numbers", "numbers", _show_numbers,
         "No numbers were extracted."),
        ("📊 Tables", "tables", display_table_results,
         "No tables were extracted."),
        ("🔗 Links", "links", display_link_results,
         "No links were extracted."),
        ("📄 Metadata", "metadata", display_metadata_results,
         "No metadata was extracted."),
    )

    for heading, key, render, empty_message in sections:
        st.subheader(heading)
        payload = scraped_data.get(key)
        if payload:
            render(payload)
        else:
            st.info(empty_message)
def _excel_cell(value):
    """Return *value* as something that fits in one worksheet cell.

    Containers (dict, list, tuple, set) are serialised to a JSON string, with
    ``str()`` as a fallback; every other value is returned unchanged.
    """
    if isinstance(value, (dict, list, tuple, set)):
        try:
            return json.dumps(value, ensure_ascii=False, default=str)
        except (TypeError, ValueError):
            return str(value)
    return value


def _excel_list_frame(items, column):
    """Build a sheet from a list: dict items become columns, others one column."""
    if not isinstance(items, (list, tuple)):
        items = [items]
    if items and all(isinstance(item, dict) for item in items):
        records = [
            {field: _excel_cell(val) for field, val in item.items()}
            for item in items
        ]
        return pd.DataFrame(records)
    return pd.DataFrame({column: [_excel_cell(item) for item in items]})


def _excel_text_frame(text_content):
    """Build the 'Text' sheet; a dict is flattened to (Section, Text) rows."""
    if not isinstance(text_content, dict):
        return pd.DataFrame({'Text': [_excel_cell(text_content)]})
    rows = []
    for section, value in text_content.items():
        entries = value if isinstance(value, (list, tuple)) else [value]
        rows.extend(
            {'Section': section, 'Text': _excel_cell(entry)} for entry in entries
        )
    return pd.DataFrame(rows, columns=['Section', 'Text'])


def _excel_mapping_frame(mapping):
    """Build a two-column Property/Value sheet from a dict."""
    if isinstance(mapping, dict):
        pairs = [(prop, _excel_cell(val)) for prop, val in mapping.items()]
        return pd.DataFrame(pairs, columns=['Property', 'Value'])
    # Not a mapping after all: fall back to a plain single-column listing.
    return _excel_list_frame(mapping, 'Value')


def _excel_table_frame(table):
    """Coerce one scraped table (DataFrame, header/rows dict, or rows) to a DataFrame."""
    if isinstance(table, pd.DataFrame):
        return table
    if isinstance(table, dict):
        rows = table.get('rows') or []
        header = table.get('header') or None
    else:
        rows, header = table, None
    clean_rows = [
        [_excel_cell(cell) for cell in row] if isinstance(row, (list, tuple)) else row
        for row in rows
    ]
    if header:
        try:
            return pd.DataFrame(clean_rows, columns=list(header))
        except (ValueError, TypeError):
            # Header width does not match the rows; use default column labels.
            pass
    return pd.DataFrame(clean_rows)


def to_excel(data):
    """Convert a dictionary of scraped data to an in-memory Excel workbook.

    One sheet is written per populated section, in this order: ``Links``,
    ``Images``, ``Numbers``, ``Text``, ``Metadata``, ``Video info``,
    ``Profile info``, ``Comments`` and ``Table_<n>`` for each table.

    Nested values (for example a list of link dicts, or the title / headings /
    paragraphs dict that ``display_text_results`` reads) are flattened or
    JSON-encoded so that every cell holds a scalar. Tables may be DataFrames or
    the ``{'header': ..., 'rows': ...}`` dicts that ``display_table_results``
    renders.

    Args:
        data: Mapping of scraped sections. ``None`` or an empty mapping
            produces a workbook with a single placeholder sheet.

    Returns:
        bytes: The contents of the ``.xlsx`` file.

    Raises:
        ImportError: Propagated from pandas when the ``openpyxl`` engine is
            not installed.
    """
    data = data or {}
    sheets = []

    # Simple lists (links, images, numbers).
    for key in ("links", "images", "numbers"):
        if data.get(key):
            title = key.capitalize()
            sheets.append((title, _excel_list_frame(data[key], title)))

    # Text content.
    if data.get("text_content"):
        sheets.append(('Text', _excel_text_frame(data["text_content"])))

    # Dictionaries (metadata, video_info, profile_info).
    for key in ("metadata", "video_info", "profile_info"):
        if data.get(key):
            title = key.replace('_', ' ').capitalize()
            sheets.append((title, _excel_mapping_frame(data[key])))

    # List of dictionaries (comments).
    if data.get("comments"):
        sheets.append(('Comments', _excel_list_frame(data["comments"], 'Comment')))

    # Tables, one sheet each.
    for position, table in enumerate(data.get("tables") or [], start=1):
        sheets.append((f'Table_{position}', _excel_table_frame(table)))

    if not sheets:
        # A workbook needs at least one sheet to be saved.
        sheets.append(('Empty', pd.DataFrame({'Info': ['No data was scraped.']})))

    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='openpyxl') as writer:
        for sheet_name, frame in sheets:
            frame.to_excel(writer, sheet_name=sheet_name, index=False)
    return output.getvalue()
def _download_text(text_content):
    """Flatten scraped text content into the plain-text download body.

    A dict (title / headings / paragraphs) becomes a small report; any other
    truthy value is converted with ``str()``; nothing yields ``""``.
    """
    if not text_content:
        return ""
    if not isinstance(text_content, dict):
        return str(text_content)

    lines = [f"Title: {text_content.get('title', 'N/A')}", "", "Headings:"]
    for heading in text_content.get("headings") or []:
        if isinstance(heading, dict):
            lines.append(f"- {heading.get('level', 'h?')}: {heading.get('text', '')}")
        else:
            lines.append(f"- {heading}")
    lines += ["", "Paragraphs:"]
    lines.extend(
        f"{number}. {paragraph}"
        for number, paragraph in enumerate(text_content.get("paragraphs") or [], 1)
    )
    return "\n".join(lines) + "\n"


def _download_table_csv(table):
    """Return *table* (DataFrame, header/rows dict, or list of rows) as CSV text."""
    if isinstance(table, pd.DataFrame):
        return table.to_csv(index=False)
    if isinstance(table, dict):
        rows = table.get("rows") or []
        header = table.get("header") or None
    else:
        rows, header = table, None
    frame = None
    if header:
        try:
            frame = pd.DataFrame(rows, columns=list(header))
        except (ValueError, TypeError):
            # Header width does not match the rows; use default column labels.
            frame = None
    if frame is None:
        frame = pd.DataFrame(rows)
    return frame.to_csv(index=False)


def create_download_links(scraped_data):
    """Render the "Download Data" section with JSON, CSV, TXT and Excel buttons.

    Four columns are shown side by side. A format that has nothing to export,
    or whose conversion fails, gets a disabled button with an explanatory
    tooltip instead of aborting the whole section.

    Args:
        scraped_data: Mapping of scraped sections; ``None`` is treated as an
            empty mapping.
    """
    scraped_data = scraped_data or {}

    st.header("Download Data")
    col1, col2, col3, col4 = st.columns(4)

    # JSON download: the whole payload; non-JSON values are stringified.
    with col1:
        json_str = json.dumps(scraped_data, indent=2, default=str)
        st.download_button(
            label="Download JSON",
            data=json_str,
            file_name="scraped_data.json",
            mime="application/json",
            use_container_width=True
        )

    # CSV download: for simplicity only the first table is offered.
    with col2:
        tables = scraped_data.get("tables")
        csv = None
        csv_help = "No tables found to download."
        if tables:
            try:
                csv = _download_table_csv(tables[0])
            except Exception as e:
                # Same policy as the Excel column: degrade to a disabled button.
                csv_help = f"CSV export failed: {e}"
        if csv is not None:
            st.download_button(
                label="Download CSV",
                data=csv,
                file_name="scraped_table.csv",
                mime="text/csv",
                use_container_width=True
            )
        else:
            st.button("Download CSV", disabled=True, help=csv_help, use_container_width=True)

    # TXT download: text content flattened to plain text.
    with col3:
        text_content = _download_text(scraped_data.get("text_content"))
        if text_content:
            st.download_button(
                label="Download TXT",
                data=text_content,
                file_name="scraped_text.txt",
                mime="text/plain",
                use_container_width=True
            )
        else:
            st.button("Download TXT", disabled=True, help="No text content found to download.", use_container_width=True)

    # Excel download
    with col4:
        try:
            excel_data = to_excel(scraped_data)
            st.download_button(
                label="Download Excel",
                data=excel_data,
                file_name="scraped_data.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                use_container_width=True
            )
        except Exception as e:
            st.button("Download Excel", disabled=True, help=f"Excel export failed: {e}", use_container_width=True)
def display_youtube_results(scraped_data):
    """Render a scraped YouTube video: title, channel, views, description, comments.

    An error is shown and nothing else is rendered when ``video_info`` is
    missing or empty. The comments expander appears only when comments exist.
    """
    video_info = scraped_data.get("video_info")
    if not video_info:
        st.error("Could not extract YouTube video information.")
        return

    st.subheader(f'{video_info.get("title", "Untitled")}')
    st.write(f'**Channel:** {video_info.get("channel", "N/A")}')
    st.write(f'**Views:** {video_info.get("views", "N/A")}')

    with st.expander("Video Description"):
        st.write(video_info.get("description", "No description."))

    comments = scraped_data.get("comments")
    if not comments:
        return

    with st.expander(f'Comments ({len(comments)})'):
        for entry in comments:
            author = entry.get('author', 'Unknown')
            posted = entry.get('timestamp', 'Unknown')
            st.markdown(f"**{author}** - {posted}")
            st.write(entry.get('text', ''))
            like_count = entry.get('likes', '0')
            # The like line is skipped only for the literal string '0'.
            if like_count != '0':
                st.caption(f"👍 {like_count} likes")
            st.divider()
def display_instagram_results(scraped_data):
    """Render the profile section of an Instagram scrape.

    Shows an error and stops when ``profile_info`` is missing or empty;
    otherwise lists username, display name, bio and follower count, each
    falling back to "N/A".
    """
    profile_info = scraped_data.get("profile_info")
    if not profile_info:
        st.error("Could not extract Instagram profile information.")
        return

    username = profile_info.get("username", "N/A")
    display_name = profile_info.get("display_name", "N/A")
    bio = profile_info.get("bio", "N/A")
    followers = profile_info.get("followers", "N/A")

    with st.expander("Profile Information", expanded=True):
        st.write(f'**Username:** {username}')
        st.write(f'**Display Name:** {display_name}')
        st.write(f'**Bio:** {bio}')
        st.write(f'**Followers:** {followers}')
def _url_on_domain(url, domains):
    """Return True when the host of *url* equals, or is a subdomain of, one of *domains*.

    The host is taken from the parsed URL rather than by substring search, so a
    domain name appearing in a path or query string does not count. A URL typed
    without a scheme is still recognised.
    """
    if not url:
        return False
    candidate = url.strip().lower()
    if "://" not in candidate:
        # Without a leading "//" urlparse would treat the host as a path.
        candidate = "//" + candidate
    try:
        host = urlparse(candidate).hostname or ""
    except ValueError:
        return False
    return any(host == domain or host.endswith("." + domain) for domain in domains)


def main():
    """Render the Streamlit page: sidebar configuration, scraping, results.

    The sidebar collects the URL and the options that apply to the detected
    site type (YouTube, Instagram or generic website). When "Start Scraping"
    is pressed the input is validated, the matching scraper is run, and the
    results plus the download section are rendered. Any exception raised while
    scraping or rendering is reported with ``st.error`` instead of crashing.
    """
    # Header
    st.markdown('<h1 class="main-header">✨ Scrape Anythings</h1>', unsafe_allow_html=True)
    st.markdown('<p class="sub-header">Extract data from any website with ease</p>', unsafe_allow_html=True)

    # Sidebar for configuration
    with st.sidebar:
        st.header("Configuration")
        url = st.text_input("Enter Website URL", placeholder="https://example.com")

        is_youtube = _url_on_domain(url, ("youtube.com", "youtu.be"))
        is_instagram = _url_on_domain(url, ("instagram.com",))

        data_types, youtube_data_types, instagram_data_types, max_comments = [], [], [], 50
        # Defaults mirror perform_web_scraping's; the sliders below override
        # them for generic websites.
        max_pages, rate_limit = 1, 2

        if is_youtube:
            st.info("YouTube URL detected!")
            youtube_data_types = st.multiselect("YouTube Data Types", ["video_info", "comments"], default=["video_info", "comments"])
            if "comments" in youtube_data_types:
                max_comments = st.slider("Max Comments", 10, 200, 50)
        elif is_instagram:
            st.info("Instagram URL detected!")
            # NOTE(review): this selection is validated below but never passed
            # to the Instagram scrapers -- confirm whether they should honour it.
            instagram_data_types = st.multiselect("Instagram Data Types", ["profile_info", "images", "posts"], default=["profile_info", "images"])
        else:
            data_types = st.multiselect("Data Types", ["Text", "Images", "Links", "Tables", "Metadata", "Numbers"], default=["Text", "Links"])
            # Only the generic scraper consumes these two options.
            st.subheader("Advanced Options")
            max_pages = st.slider("Max Pages", 1, 10, 1)
            rate_limit = st.slider("Rate Limit (s)", 1, 10, 2)

        scrape_button = st.button("Start Scraping", type="primary", use_container_width=True)

    # Main content area
    if not scrape_button:
        st.markdown(
            "\n### How to Use\n"
            "1. **Enter URL** and **select data types** in the sidebar.\n"
            "2. Click **Start Scraping** to begin.\n"
            "3. View and **download the results** below.\n"
        )
        return

    if not url or not validate_url(url):
        st.error("Please enter a valid URL.")
        return

    # Validate that at least one data type is selected for the given URL type
    if is_youtube and not youtube_data_types:
        st.error("Please select at least one YouTube data type to extract.")
        return
    if is_instagram and not instagram_data_types:
        st.error("Please select at least one Instagram data type to extract.")
        return
    if not is_youtube and not is_instagram and not data_types:
        st.error("Please select at least one data type to extract.")
        return

    try:
        with st.spinner("Scraping in progress... Please wait."):
            if is_youtube:
                scraped_data = youtube_scraper.scrape_youtube_video(url, "comments" in youtube_data_types, max_comments)
            elif is_instagram:
                try:
                    scraped_data = instagram_scraper_v2.extract_instagram_data(url)
                except Exception:
                    # Deliberate best effort: any failure of the newer scraper
                    # falls through to the older implementation.
                    st.warning("Improved scraper failed, trying fallback...")
                    scraped_data = instagram_scraper.extract_instagram_data(url)
            else:
                # perform_web_scraping lower-cases the data types itself.
                scraped_data = perform_web_scraping(url, data_types, max_pages, rate_limit)

        # A scraper returning None is treated as "nothing extracted".
        scraped_data = scraped_data or {}

        if scraped_data.get("errors"):
            st.error(f'Errors: {scraped_data["errors"]}')

        # Check if any data was actually scraped before showing success
        result_keys = (
            "text_content", "images", "numbers", "tables", "links", "metadata",
            "video_info", "profile_info", "comments", "posts",
        )
        has_data = any(scraped_data.get(key) for key in result_keys)

        if has_data:
            st.success("Scraping completed successfully!")
            st.header("Scraping Results")
            display_results(scraped_data, is_youtube, is_instagram)
            # create_download_links renders its own "Download Data" header.
            create_download_links(scraped_data)
        else:
            st.warning("No data was extracted. The website might be blocking scrapers or the content is not available.")
    except Exception as e:
        # Top-level UI boundary: report the failure instead of crashing the app.
        st.error(f"An unexpected error occurred: {e}")
# Script entry point: build the page when the file is executed directly.
if __name__ == "__main__":
    main()