bot-bqf6pz2f / src /streamlit_app.py
bep40's picture
Upload src/streamlit_app.py with huggingface_hub
600deca verified
Files: 2 files loaded
=== app.py ===
import streamlit as st
import requests
from bs4 import BeautifulSoup
import re
# Function to scrape only visible text from the given URL
def scrape_visible_text_from_url(url):
    """Fetch *url* and return its visible text (header text + paragraph text).

    Parameters
    ----------
    url : str
        Address of the web page to scrape.

    Returns
    -------
    str | None
        The whitespace-normalized visible text, or ``None`` on any failure
        (an error message is shown in the Streamlit UI instead of raising).
    """
    try:
        # timeout prevents the app from hanging forever on an unresponsive host
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Capture the header text BEFORE the removal pass below: "header" is
        # in the strip list, so looking it up afterwards would always yield
        # None and silently drop the header from the output (original bug).
        header_content = soup.find("header")
        header_text = header_content.get_text() if header_content else ""
        # Remove script, style, and other non-visible/boilerplate tags so
        # their contents do not leak into the paragraph text.
        for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav", "img"]):
            tag.extract()
        # Get the paragraph content
        paragraph_text = " ".join(p.get_text() for p in soup.find_all("p"))
        # Combine header and paragraph text
        visible_text = f"{header_text}\n\n{paragraph_text}"
        # Collapse runs of whitespace/newlines into single spaces
        visible_text = re.sub(r'\s+', ' ', visible_text)
        return visible_text.strip()
    except Exception as e:
        # Broad catch is deliberate here: any network/parse failure is
        # surfaced to the UI rather than crashing the Streamlit app.
        st.error(f"Error occurred while scraping the data: {e}")
        return None
# Streamlit UI
def main():
    """Streamlit entry point: prompt for a URL and display its scraped text."""
    st.title("Web Data Scraper")
    # Ask the user for the page to scrape
    target_url = st.text_input("Enter the URL of the web page:", "")
    # Nothing to do until the button is pressed
    if not st.button("Scrape Visible Text"):
        return
    # Guard clause: an empty input field cannot be scraped
    if not target_url:
        st.warning("Please enter a valid URL.")
        return
    scraped_text = scrape_visible_text_from_url(target_url)
    if scraped_text:
        st.success("Visible text successfully scraped!")
        st.subheader("Scraped Text:")
        st.write(scraped_text)
    else:
        st.warning("Failed to scrape visible text from the URL.")


if __name__ == "__main__":
    main()
=== requirements.txt ===
aiohttp==3.8.5
aiosignal==1.3.1
altair==5.0.1
async-timeout==4.0.2
attrs==23.1.0
beautifulsoup4==4.12.2
blinker==1.6.2
bs4==0.0.1
cachetools==5.3.1
certifi==2023.7.22
charset-normalizer==3.2.0
click==8.1.6
decorator==5.1.1
frozenlist==1.4.0
gitdb==4.0.10
GitPython==3.1.32
idna==3.4
importlib-metadata==6.8.0
Jinja2==3.1.2
jsonschema==4.18.4
jsonschema-specifications==2023.7.1
markdown-it-py==3.0.0
MarkupSafe==2.1.3
mdurl==0.1.2
multidict==6.0.4
numpy==1.25.2
openai==0.27.8
packaging==23.1
pandas==2.0.3
Pillow==9.5.0
protobuf==4.23.4
pyarrow==12.0.1
pydeck==0.8.0
Pygments==2.15.1
Pympler==1.0.1
python-dateutil==2.8.2
python-dotenv==1.0.0
pytz==2023.3
pytz-deprecation-shim==0.1.0.post0
referencing==0.30.0
requests==2.31.0
rich==13.5.2
rpds-py==0.9.2
six==1.16.0
smmap==5.0.0
soupsieve==2.4.1
streamlit==1.25.0
tenacity==8.2.2
toml==0.10.2
toolz==0.12.0
tornado==6.3.2
tqdm==4.65.0
typing_extensions==4.7.1
tzdata==2023.3
tzlocal==4.3.1
urllib3==2.0.4
validators==0.20.0
watchdog==3.0.0
yarl==1.9.2
zipp==3.16.2