# text_tiger / app.py
# Author: guifav
# Commit 302b997: Updated app to use Gradio instead of Streamlit
import gradio as gr
import requests
from bs4 import BeautifulSoup
import re
def scrape_visible_text_from_url(url, timeout=10):
    """Fetch a web page and return its header and paragraph text on one line.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup.
    Non-visible and boilerplate tags are removed, the text of the first
    ``<header>`` element (if any) is captured, and the text of every ``<p>``
    element that is not inside a header is appended after it. Runs of
    whitespace are then collapsed into single spaces.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up, so a
            stalled host cannot block the caller forever.

    Returns:
        The cleaned text. If anything goes wrong (bad URL, network failure,
        HTTP error status, parse error) a string starting with
        "Error occurred while scraping the data:" is returned instead of
        raising.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove script, style, and other non-visible tags. "header" is
        # deliberately not in this list: removing it here would make the
        # header lookup below always come back empty.
        for tag in soup(["script", "style", "meta", "link", "noscript", "footer", "aside", "nav", "img"]):
            tag.extract()

        # Get the header content while it is still part of the tree.
        header_content = soup.find("header")
        header_text = header_content.get_text() if header_content else ""

        # Drop every header now, so paragraphs nested inside one are not
        # emitted a second time by the paragraph pass.
        for tag in soup("header"):
            tag.extract()

        # Get the paragraph content.
        paragraph_text = " ".join(p.get_text() for p in soup.find_all("p"))

        # Combine header and paragraph text, then collapse whitespace/newlines.
        visible_text = f"{header_text}\n\n{paragraph_text}"
        visible_text = re.sub(r'\s+', ' ', visible_text)
        return visible_text.strip()
    except Exception as e:
        # UI boundary: report the failure as text rather than crashing the app.
        return f"Error occurred while scraping the data: {e}"
def scrape_and_display(url):
    """Gradio callback: scrape *url* and return the text to show the user.

    Returns a prompt asking for a URL when *url* is empty, the scraped text
    when scraping produced any, and a failure message when the scraper
    returned an empty result.
    """
    # Guard clause: nothing was typed into the textbox.
    if not url:
        return "Please enter a valid URL."

    scraped = scrape_visible_text_from_url(url)
    # An empty result means the page yielded no header or paragraph text.
    return scraped or "Failed to scrape visible text from the URL."
# Build the Gradio UI: one URL textbox in, one textbox of scraped text out,
# wired to scrape_and_display.
iface = gr.Interface(
    title="Web Data Scraper",
    description="Enter a URL to scrape visible text from the web page.",
    theme="huggingface",
    fn=scrape_and_display,
    inputs=gr.Textbox(label="Enter the URL of the web page:"),
    outputs=gr.Textbox(label="Scraped Text:"),
)

# Start the web server only when this file is executed as a script, so that
# importing the module does not launch the app.
if __name__ == "__main__":
    iface.launch()