Engineer786 committed on
Commit
4baad7f
·
verified ·
1 Parent(s): 4146964

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -0
app.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import urllib3
5
+ import pandas as pd
6
+ import tempfile
7
+
8
def simple_web_scraper(url, scrape_option):
    """Fetch *url* and render its scraped content in the Streamlit app.

    Depending on *scrape_option*, either all non-blank text lines of the
    page ('data') or every href found in ``<a>`` tags ('links') are shown
    in a table, with a button to download the same rows as CSV.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    scrape_option : str
        Either 'data' or 'links'; anything else shows a usage message.

    Side effects
    ------------
    Writes Streamlit widgets (table, download button, error messages);
    returns None.
    """
    try:
        # urllib3 PoolManager handles connection pooling and SSL.
        http = urllib3.PoolManager()
        response = http.request('GET', url)

        # Early return on any non-200 status.
        if response.status != 200:
            st.write(f'Error: {response.status}')
            return

        soup = BeautifulSoup(response.data, 'html.parser')

        # Build the table rows and pick the download metadata for the
        # chosen mode; the table/CSV plumbing below is shared.
        if scrape_option == 'data':
            all_text = soup.get_text()
            table_data = [{'Data': line.strip()}
                          for line in all_text.split('\n') if line.strip()]
            label, file_name = "Download Data as CSV", "scraped_data.csv"
        elif scrape_option == 'links':
            links = soup.find_all('a')
            table_data = [{'Links': link.get('href')}
                          for link in links if link.get('href')]
            label, file_name = "Download Links as CSV", "scraped_links.csv"
        else:
            st.write('Invalid scrape option. Please choose "data" or "links".')
            return

        st.table(table_data)

        # Build the CSV entirely in memory. The previous implementation
        # used NamedTemporaryFile(delete=False) and re-opened it without
        # closing either handle, leaking a temp file and two fds per click.
        csv_bytes = pd.DataFrame(table_data).to_csv(index=False).encode('utf-8')
        st.download_button(
            label=label,
            data=csv_bytes,
            file_name=file_name,
            mime="text/csv",
        )

    except Exception as e:
        # UI boundary: surface the failure to the user instead of letting
        # the Streamlit script run crash.
        st.write(f'An error occurred: {e}')
73
+
74
# Streamlit UI: collect the target URL and scrape mode, then run the
# scraper on demand.
st.title("Web Scraping Tool")
website_url = st.text_input("Enter the URL to scrape:")
scrape_option = st.selectbox("Select what to scrape:", ['data', 'links'])

if st.button("Scrape"):
    # Guard against a blank URL before firing an HTTP request, which
    # previously surfaced only as a raw exception message.
    if website_url.strip():
        simple_web_scraper(website_url, scrape_option)
    else:
        st.write('Please enter a URL to scrape.')