Spaces:

Engineer786
/

WebScraper

Sleeping

App Files Files Community

Engineer786 commited on Jan 3, 2025

Commit

4baad7f

verified ·

1 Parent(s): 4146964

Create app.py

Browse files

Files changed (1) hide show

app.py +80 -0

app.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import streamlit as st
+import requests
+from bs4 import BeautifulSoup
+import urllib3
+import pandas as pd
+import tempfile
+def simple_web_scraper(url, scrape_option):
+    try:
+        # Create a PoolManager with urllib3 to handle SSL
+        http = urllib3.PoolManager()
+        # Send an HTTP request
+        response = http.request('GET', url)
+        # Check if the request was successful (status code 200)
+        if response.status == 200:
+            # Parse the HTML content of the page
+            soup = BeautifulSoup(response.data, 'html.parser')
+            # Extract information from the HTML based on user's choice
+            if scrape_option == 'data':
+                # Extract all text content from the page
+                all_text = soup.get_text()
+                # Prepare data for the table (split text by lines)
+                table_data = [{'Data': line.strip()} for line in all_text.split('\n') if line.strip()]
+                # Display the data in a table
+                st.table(table_data)
+                # Save data to a temporary CSV file
+                df = pd.DataFrame(table_data)
+                csv_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
+                df.to_csv(csv_file.name, index=False)
+                # Provide a download button for the CSV file
+                st.download_button(
+                    label="Download Data as CSV",
+                    data=open(csv_file.name, "rb").read(),
+                    file_name="scraped_data.csv",
+                    mime="text/csv",
+                )
+            elif scrape_option == 'links':
+                # Example: Extract all the links on the page
+                links = soup.find_all('a')
+                # Prepare data for the table
+                table_data = [{'Links': link.get('href')} for link in links if link.get('href')]
+                # Display the data in a table
+                st.table(table_data)
+                # Save links to a temporary CSV file
+                df = pd.DataFrame(table_data)
+                csv_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
+                df.to_csv(csv_file.name, index=False)
+                # Provide a download button for the CSV file
+                st.download_button(
+                    label="Download Links as CSV",
+                    data=open(csv_file.name, "rb").read(),
+                    file_name="scraped_links.csv",
+                    mime="text/csv",
+                )
+            else:
+                st.write('Invalid scrape option. Please choose "data" or "links".')
+        else:
+            st.write(f'Error: {response.status}')
+    except Exception as e:
+        st.write(f'An error occurred: {e}')
+# Streamlit UI
+st.title("Web Scraping Tool")
+website_url = st.text_input("Enter the URL to scrape:")
+scrape_option = st.selectbox("Select what to scrape:", ['data', 'links'])
+if st.button("Scrape"):
+    simple_web_scraper(website_url, scrape_option)