import gradio as gr
import requests
from bs4 import BeautifulSoup
import re

def scrape_visible_text_from_url(url, timeout=10):
    """Fetch a web page and return its header and paragraph text.

    The page is downloaded, non-visible and boilerplate elements are
    dropped, and the text of the first ``<header>`` element is combined
    with the text of every remaining ``<p>`` element. All runs of
    whitespace are collapsed to single spaces.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        The collapsed text (possibly an empty string when the page has no
        header or paragraph text), or a message starting with
        ``"Error occurred while scraping the data: "`` if anything fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove script, style, and other non-visible tags. "header" is
        # deliberately NOT removed yet: its text is read just below.
        for tag in soup(["script", "style", "meta", "link", "noscript", "footer", "aside", "nav", "img"]):
            tag.decompose()

        # Get the header content before the header elements are dropped;
        # looking it up after removal would always yield nothing.
        header_content = soup.find("header")
        header_text = header_content.get_text(" ") if header_content else ""

        # Drop every header now so paragraphs nested inside one are not
        # collected a second time by the paragraph pass.
        for tag in soup("header"):
            tag.decompose()

        # Get the paragraph content
        paragraph_content = soup.find_all("p")
        paragraph_text = " ".join(p.get_text() for p in paragraph_content)

        # Combine header and paragraph text
        visible_text = f"{header_text}\n\n{paragraph_text}"

        # Remove multiple whitespaces and newlines
        visible_text = re.sub(r'\s+', ' ', visible_text)
        return visible_text.strip()
    except Exception as e:
        # UI boundary: report the failure as text instead of raising so the
        # caller can show it directly to the user.
        return f"Error occurred while scraping the data: {e}"

def scrape_and_display(url):
    """Return the scraped text for *url*, or a user-facing message.

    Args:
        url: The address typed by the user. Surrounding whitespace is
            ignored; ``None``, empty, and whitespace-only values are
            treated as "no URL given".

    Returns:
        The text produced by ``scrape_visible_text_from_url`` (which may
        itself be an error message), ``"Failed to scrape visible text from
        the URL."`` when that text is empty, or ``"Please enter a valid
        URL."`` when no URL was supplied.
    """
    # Trim pasted whitespace so "  " is rejected here rather than being
    # forwarded to the HTTP client and surfacing as a raw request error.
    cleaned_url = url.strip() if isinstance(url, str) else url
    if not cleaned_url:
        return "Please enter a valid URL."

    data = scrape_visible_text_from_url(cleaned_url)
    if not data:
        return "Failed to scrape visible text from the URL."
    return data

# Define the Gradio interface: one text input (the URL) wired to
# scrape_and_display, whose returned string is shown in one text output.
# NOTE(review): whether the string theme name "huggingface" is recognised
# depends on the installed Gradio version — confirm against the pinned version.
iface = gr.Interface(
    fn=scrape_and_display,
    inputs=gr.Textbox(label="Enter the URL of the web page:"),
    outputs=gr.Textbox(label="Scraped Text:"),
    title="Web Data Scraper",
    description="Enter a URL to scrape visible text from the web page.",
    theme="huggingface"
)

# Launch the Gradio app only when this file is run as a script, so that
# importing the module builds `iface` without starting a server.
if __name__ == "__main__":
    iface.launch()