Spaces:

ZainabEman
/

Customizable-Web-Scrapper

Sleeping

App Files Files Community

Customizable-Web-Scrapper / app.py

ZainabEman

Upload 2 files

8670562 verified about 1 year ago

raw

history blame contribute delete

3.71 kB

	import streamlit as st
	import requests
	from bs4 import BeautifulSoup
	import pandas as pd
	import json
	from requests.exceptions import RequestException

	# Function to analyze available tags on the page
	def analyze_page(url):
	    """Fetch ``url`` and report which supported HTML tags occur on the page.

	    Only tags from a fixed whitelist (headings, paragraphs, links, images
	    and list elements) are reported.

	    Args:
	        url: Address of the page to inspect.

	    Returns:
	        An alphabetically sorted list of the whitelisted tag names found in
	        the document (empty when none occur), or ``None`` when the request
	        fails. On failure the error is also displayed via ``st.error``.
	    """
	    useful_tags = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'a', 'img', 'ul', 'ol', 'li'}
	    try:
	        response = requests.get(url, timeout=10)
	        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
	        html = response.content
	    except RequestException as e:
	        st.error(f"Failed to fetch the page: {str(e)}")
	        return None

	    soup = BeautifulSoup(html, 'html.parser')
	    available_tags = {tag.name for tag in soup.find_all(True) if tag.name in useful_tags}
	    # A set has no defined iteration order; sort so the options offered to
	    # the user appear in the same, predictable order on every run.
	    return sorted(available_tags)

	# Function to scrape selected tags
	def scrape_data(url, selected_tags):
	    """Download ``url`` and collect every element matching ``selected_tags``.

	    Each matching element becomes one row. Images contribute their ``src``
	    and ``alt`` attributes, links their ``href`` and visible text, and any
	    other tag its stripped text content.

	    Args:
	        url: Address of the page to scrape.
	        selected_tags: Iterable of tag names to extract, processed in order.

	    Returns:
	        A ``pandas.DataFrame`` with one row per element found, or ``None``
	        when the request fails (the error is shown via ``st.error``).
	    """

	    def build_row(tag_name, element):
	        # Map a single parsed element to the record stored for its tag type.
	        if tag_name == 'img':
	            return {
	                'Type': tag_name,
	                'Src': element.get('src', ''),
	                'Alt Text': element.get('alt', ''),
	            }
	        if tag_name == 'a':
	            return {
	                'Type': tag_name,
	                'URL': element.get('href', ''),
	                'Text': element.get_text(strip=True),
	            }
	        return {'Type': tag_name, 'Content': element.get_text(strip=True)}

	    try:
	        page = requests.get(url, timeout=10)
	        page.raise_for_status()
	        document = BeautifulSoup(page.content, 'html.parser')
	        rows = [
	            build_row(tag_name, element)
	            for tag_name in selected_tags
	            for element in document.find_all(tag_name)
	        ]
	        return pd.DataFrame(rows)
	    except RequestException as err:
	        st.error(f"Error scraping data: {str(err)}")
	        return None

	# Main Streamlit app
	def main():
	    """Render the scraper UI and drive the analyze -> select -> scrape flow.

	    The user enters a URL, the page is analyzed for supported tags, the user
	    picks which tags to extract, and on "Scrape Now" the results are shown
	    in a table with CSV and JSON download buttons.

	    Failure cases are reported separately: a page that could not be fetched
	    produces an error, a page that was fetched but contains none of the
	    supported tags produces a warning, and a scrape that fails at the
	    network level relies on the error already shown by ``scrape_data``
	    rather than adding a "no data" warning on top of it.
	    """
	    st.title("Live Web Scraper")
	    st.write("Enter a URL and select the HTML tags you want to scrape!")

	    # URL input; surrounding whitespace (e.g. from copy/paste) is dropped so
	    # a blank-looking value is treated as "no URL entered yet".
	    url = (st.text_input("Enter the URL to scrape:", key="url_input") or "").strip()
	    if not url:
	        return

	    with st.spinner("Analyzing the page..."):
	        available_tags = analyze_page(url)

	    if available_tags is None:
	        # The request itself failed.
	        st.error("Couldn’t analyze the page. Check the URL and try again.")
	        return
	    if not available_tags:
	        # The page loaded fine but has nothing this app knows how to scrape.
	        st.warning("No supported tags were found on this page.")
	        return

	    # Tag selection
	    selected_tags = st.multiselect("Select tags to scrape:", available_tags, key="tag_select")

	    if not st.button("Scrape Now", key="scrape_button"):
	        return

	    with st.spinner("Scraping data..."):
	        df = scrape_data(url, selected_tags)

	    if df is None:
	        # scrape_data has already displayed the request error.
	        return
	    if df.empty:
	        st.warning("No data found for the selected tags.")
	        return

	    st.success("Scraping complete!")
	    st.write("### Scraped Data:")
	    st.dataframe(df)

	    # Download options
	    col1, col2 = st.columns(2)
	    with col1:
	        csv = df.to_csv(index=False).encode('utf-8')
	        st.download_button(
	            label="Download as CSV",
	            data=csv,
	            file_name="scraped_data.csv",
	            mime="text/csv",
	            key="csv_download",
	        )
	    with col2:
	        json_data = df.to_json(orient='records')
	        st.download_button(
	            label="Download as JSON",
	            data=json_data,
	            file_name="scraped_data.json",
	            mime="application/json",
	            key="json_download",
	        )

	# Entry point: build the UI only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()