Spaces:
Build error
Build error
| import gradio as gr | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import re | |
def scrape_visible_text_from_url(url: str, timeout: float = 10) -> str:
    """Fetch *url* and return its visible text on a single line.

    The result is the text of the first ``<header>`` element followed by
    the text of every ``<p>`` element that is not inside a header, footer,
    aside or nav. Runs of whitespace are collapsed to one space.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server; passed straight to
            ``requests.get``.

    Returns:
        The extracted text, or a message beginning with
        ``"Error occurred while scraping the data:"`` when the download or
        the parsing fails. Failures are reported through the return value
        rather than raised.
    """
    try:
        # Without a timeout a stalled server would block the caller forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop non-visible tags and page chrome. "header" is deliberately
        # NOT removed here: its text still has to be read below.
        for tag in soup(["script", "style", "meta", "link", "noscript", "footer", "aside", "nav", "img"]):
            tag.extract()

        # Read the first header while it is still part of the tree. A space
        # separator keeps adjacent child elements from being glued together.
        header = soup.find("header")
        header_text = header.get_text(" ") if header else ""

        # Now remove every header so paragraphs nested inside one are not
        # emitted a second time by the paragraph pass.
        for tag in soup("header"):
            tag.extract()

        paragraph_text = " ".join(p.get_text() for p in soup.find_all("p"))

        # Combine both parts, then collapse whitespace and newlines.
        visible_text = f"{header_text}\n\n{paragraph_text}"
        return re.sub(r'\s+', ' ', visible_text).strip()
    except Exception as e:
        # Deliberately broad: this feeds a UI textbox, so any failure is
        # turned into a readable message instead of a traceback.
        return f"Error occurred while scraping the data: {e}"
def scrape_and_display(url):
    """Return the text to show in the UI for the submitted *url*.

    An empty or missing URL yields a prompt to enter one. Otherwise the
    page is scraped and its text returned; if scraping produces nothing,
    a failure message is returned instead.
    """
    # Guard clause: nothing was typed into the textbox.
    if not url:
        return "Please enter a valid URL."

    scraped = scrape_visible_text_from_url(url)
    # An empty scrape result is reported as a failure.
    return scraped or "Failed to scrape visible text from the URL."
# Gradio UI: one textbox in (the URL), one textbox out (the scraped text),
# with scrape_and_display as the callback.
iface = gr.Interface(
    scrape_and_display,
    gr.Textbox(label="Enter the URL of the web page:"),
    gr.Textbox(label="Scraped Text:"),
    theme="huggingface",
    title="Web Data Scraper",
    description="Enter a URL to scrape visible text from the web page.",
)

# Start the app only when this file is executed directly, not on import.
if __name__ == "__main__":
    iface.launch()