"""Gradio app that fetches a web page and shows its visible header/paragraph text."""

import re

import gradio as gr
import requests
from bs4 import BeautifulSoup

# Tags whose content should never appear in the scraped text.  "header" is
# deliberately NOT listed here: its text is captured first and the element is
# removed afterwards (see scrape_visible_text_from_url).
_NON_CONTENT_TAGS = [
    "script",
    "style",
    "meta",
    "link",
    "noscript",
    "footer",
    "aside",
    "nav",
    "img",
]


def scrape_visible_text_from_url(url, timeout=10):
    """Return the header and paragraph text of the page at *url*.

    The page is downloaded, non-content tags are stripped, and the text of the
    first ``<header>`` element is combined with the text of every remaining
    ``<p>`` element.  Runs of whitespace are collapsed to single spaces.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout ``requests`` may block forever on an unresponsive host.

    Returns:
        The cleaned text, or a string starting with
        ``"Error occurred while scraping the data: "`` if anything fails.
        This function never raises; errors are reported in the return value
        so the UI can display them.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove script, style, and other non-visible / boilerplate tags.
        for tag in soup(_NON_CONTENT_TAGS):
            tag.extract()

        # Capture the header text BEFORE removing <header> elements; removing
        # them first would leave nothing to find and the header text would
        # always be empty.
        header_content = soup.find("header")
        header_text = header_content.get_text() if header_content is not None else ""

        # Now drop every <header> so paragraphs nested inside one are not
        # repeated in the paragraph text below.
        for tag in soup("header"):
            tag.extract()

        # Get the paragraph content.
        paragraph_text = " ".join(p.get_text() for p in soup.find_all("p"))

        # Combine header and paragraph text, then collapse whitespace/newlines.
        visible_text = f"{header_text}\n\n{paragraph_text}"
        visible_text = re.sub(r'\s+', ' ', visible_text)

        return visible_text.strip()
    except Exception as e:
        # Broad catch is intentional: this is the UI boundary, and any failure
        # (network, HTTP status, parsing) is shown to the user as text.
        return f"Error occurred while scraping the data: {e}"


def scrape_and_display(url):
    """Gradio callback: validate *url*, scrape it, and return text to display.

    Args:
        url: Raw text entered by the user.

    Returns:
        The scraped text (or the scraper's error message), a prompt to enter a
        URL when the input is empty or blank, or a failure notice when the
        page yielded no text.
    """
    # Treat None, "" and whitespace-only input alike.
    cleaned_url = url.strip() if url else ""
    if not cleaned_url:
        return "Please enter a valid URL."

    data = scrape_visible_text_from_url(cleaned_url)
    if not data:
        return "Failed to scrape visible text from the URL."
    return data


# Define the Gradio interface
iface = gr.Interface(
    fn=scrape_and_display,
    inputs=gr.Textbox(label="Enter the URL of the web page:"),
    outputs=gr.Textbox(label="Scraped Text:"),
    title="Web Data Scraper",
    description="Enter a URL to scrape visible text from the web page.",
    theme="huggingface"
)

# Launch the Gradio app
if __name__ == "__main__":
    iface.launch()