ZainabEman commited on
Commit
8670562
·
verified ·
1 Parent(s): 967f04c

Upload 2 files

Browse files
Files changed (2) hide show
  1. Requirements.txt +4 -0
  2. app.py +92 -0
Requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit
2
+ requests
3
+ beautifulsoup4
4
+ pandas
app.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import pandas as pd
5
+ import json
6
+ from requests.exceptions import RequestException
7
+
# Function to analyze available tags on the page
def analyze_page(url):
    """Fetch *url* and report which common content-bearing HTML tags it uses.

    Parameters
    ----------
    url : str
        The page to download and inspect.

    Returns
    -------
    list[str] | None
        Sorted names of the tags found on the page, restricted to a fixed
        whitelist of useful tags; ``None`` when the request fails (the
        failure is also surfaced to the user via ``st.error``).
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an error for bad status codes (4xx/5xx)
        soup = BeautifulSoup(response.content, 'html.parser')
        useful_tags = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'a', 'img', 'ul', 'ol', 'li'}
        available_tags = {tag.name for tag in soup.find_all(True) if tag.name in useful_tags}
        # Sort before returning: set iteration order is arbitrary, so the raw
        # list(...) made the multiselect options shuffle between reruns.
        return sorted(available_tags)
    except RequestException as e:
        st.error(f"Failed to fetch the page: {str(e)}")
        return None
20
+
# Function to scrape selected tags
def scrape_data(url, selected_tags):
    """Download *url* and collect the content of the chosen HTML tags.

    Parameters
    ----------
    url : str
        Page to download.
    selected_tags : iterable[str]
        Tag names to extract (e.g. ``['p', 'a', 'img']``).

    Returns
    -------
    pandas.DataFrame | None
        One row per matched element, or ``None`` when the HTTP request
        fails (the error is shown to the user via ``st.error``).
    """
    def _as_row(tag_name, element):
        # Images and links carry their payload in attributes; every other
        # tag is reduced to its visible text.
        if tag_name == 'img':
            return {'Type': tag_name, 'Src': element.get('src', ''), 'Alt Text': element.get('alt', '')}
        if tag_name == 'a':
            return {'Type': tag_name, 'URL': element.get('href', ''), 'Text': element.get_text(strip=True)}
        return {'Type': tag_name, 'Content': element.get_text(strip=True)}

    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
        parsed = BeautifulSoup(page.content, 'html.parser')
        # Same traversal order as before: tags in selection order,
        # elements in document order within each tag.
        rows = [
            _as_row(name, node)
            for name in selected_tags
            for node in parsed.find_all(name)
        ]
        return pd.DataFrame(rows)
    except RequestException as e:
        st.error(f"Error scraping data: {str(e)}")
        return None
40
+
# Main Streamlit app
def main():
    """Render the scraper UI: URL input, tag selection, results, downloads."""
    st.title("Live Web Scraper")
    st.write("Enter a URL and select the HTML tags you want to scrape!")

    # URL input
    url = st.text_input("Enter the URL to scrape:", key="url_input")
    if not url:
        return

    with st.spinner("Analyzing the page..."):
        available_tags = analyze_page(url)

    # None (request failed) and an empty tag list both fall through here.
    if not available_tags:
        st.error("Couldn’t analyze the page. Check the URL and try again.")
        return

    # Tag selection
    selected_tags = st.multiselect("Select tags to scrape:", available_tags, key="tag_select")

    if not st.button("Scrape Now", key="scrape_button"):
        return

    with st.spinner("Scraping data..."):
        df = scrape_data(url, selected_tags)

    if df is None or df.empty:
        st.warning("No data found for the selected tags.")
        return

    st.success("Scraping complete!")
    st.write("### Scraped Data:")
    st.dataframe(df)
    _offer_downloads(df)


def _offer_downloads(df):
    """Show side-by-side CSV and JSON download buttons for *df*."""
    # Download options
    col1, col2 = st.columns(2)
    with col1:
        csv_bytes = df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download as CSV",
            data=csv_bytes,
            file_name="scraped_data.csv",
            mime="text/csv",
            key="csv_download"
        )
    with col2:
        json_payload = df.to_json(orient='records')
        st.download_button(
            label="Download as JSON",
            data=json_payload,
            file_name="scraped_data.json",
            mime="application/json",
            key="json_download"
        )
90
+
# Standard script entry point: launch the Streamlit app when run directly
# (e.g. via `streamlit run app.py`).
if __name__ == "__main__":
    main()