Spaces:
No application file
No application file
Files: 2 files loaded

=== app.py ===
| import streamlit as st | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import re | |
# Function to scrape only visible text from the given URL
def scrape_visible_text_from_url(url):
    """Fetch *url* and return its human-visible text (header + paragraphs).

    Parameters
    ----------
    url : str
        Address of the web page to scrape.

    Returns
    -------
    str or None
        Whitespace-normalized visible text, or ``None`` when the request
        or parsing fails (the error is reported via ``st.error``).
    """
    try:
        # Time-bound the request so a stalled server can't hang the app.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # BUG FIX: read the header text BEFORE stripping tags — the
        # original removed "header" elements first, so soup.find("header")
        # was always None and header_text was always "".
        header_content = soup.find("header")
        header_text = header_content.get_text() if header_content else ""
        # Remove script, style, and other non-visible tags
        for tag in soup(["script", "style", "meta", "link", "noscript",
                         "header", "footer", "aside", "nav", "img"]):
            tag.extract()
        # Get the paragraph content
        paragraph_text = " ".join(p.get_text() for p in soup.find_all("p"))
        # Combine header and paragraph text
        visible_text = f"{header_text}\n\n{paragraph_text}"
        # Remove multiple whitespaces and newlines
        return re.sub(r'\s+', ' ', visible_text).strip()
    except Exception as e:
        st.error(f"Error occurred while scraping the data: {e}")
        return None
# Streamlit UI
def main():
    """Render the scraper UI: URL input, trigger button, result panel."""
    st.title("Web Data Scraper")
    # Get the URL from the user
    url_input = st.text_input("Enter the URL of the web page:", "")
    # Nothing to do until the user presses the button.
    if not st.button("Scrape Visible Text"):
        return
    if not url_input:
        st.warning("Please enter a valid URL.")
        return
    # Extract visible text from the URL and show the outcome.
    data = scrape_visible_text_from_url(url_input)
    if data:
        st.success("Visible text successfully scraped!")
        st.subheader("Scraped Text:")
        st.write(data)
    else:
        st.warning("Failed to scrape visible text from the URL.")


if __name__ == "__main__":
    main()
=== requirements.txt ===
aiohttp==3.8.5
aiosignal==1.3.1
altair==5.0.1
async-timeout==4.0.2
attrs==23.1.0
beautifulsoup4==4.12.2
blinker==1.6.2
bs4==0.0.1
cachetools==5.3.1
certifi==2023.7.22
charset-normalizer==3.2.0
click==8.1.6
decorator==5.1.1
frozenlist==1.4.0
gitdb==4.0.10
GitPython==3.1.32
idna==3.4
importlib-metadata==6.8.0
Jinja2==3.1.2
jsonschema==4.18.4
jsonschema-specifications==2023.7.1
markdown-it-py==3.0.0
MarkupSafe==2.1.3
mdurl==0.1.2
multidict==6.0.4
numpy==1.25.2
openai==0.27.8
packaging==23.1
pandas==2.0.3
Pillow==9.5.0
protobuf==4.23.4
pyarrow==12.0.1
pydeck==0.8.0
Pygments==2.15.1
Pympler==1.0.1
python-dateutil==2.8.2
python-dotenv==1.0.0
pytz==2023.3
pytz-deprecation-shim==0.1.0.post0
referencing==0.30.0
requests==2.31.0
rich==13.5.2
rpds-py==0.9.2
six==1.16.0
smmap==5.0.0
soupsieve==2.4.1
streamlit==1.25.0
tenacity==8.2.2
toml==0.10.2
toolz==0.12.0
tornado==6.3.2
tqdm==4.65.0
typing_extensions==4.7.1
tzdata==2023.3
tzlocal==4.3.1
urllib3==2.0.4
validators==0.20.0
watchdog==3.0.0
yarl==1.9.2
zipp==3.16.2