| import gradio as gr |
| import urllib3 |
| from bs4 import BeautifulSoup |
| from urllib.parse import urljoin |
|
|
| |
class CustomSession:
    """Thin HTTP client wrapping a urllib3 connection pool."""

    def __init__(self):
        # One PoolManager per session; it reuses connections across requests.
        self.pool_manager = urllib3.PoolManager()

    def get(self, url):
        """Issue a GET request to *url* and wrap the reply in a CustomResponse."""
        raw = self.pool_manager.request('GET', url)
        return CustomResponse(raw)
|
|
| class CustomResponse: |
| def __init__(self, response): |
| self.status_code = response.status |
| self.headers = response.headers |
| self.content = response.data |
|
|
| def soup(self): |
| return BeautifulSoup(self.content, 'lxml') |
|
|
def get(url):
    """Convenience wrapper: perform a GET with a throwaway CustomSession."""
    return CustomSession().get(url)
|
|
def extract_texts(soup):
    """Return every whitespace-stripped text fragment in the document, in order."""
    return list(soup.stripped_strings)
|
|
def extract_links(soup, base_url):
    """Extract all hyperlinks from the parsed page.

    Args:
        soup: Parsed document (BeautifulSoup-like) supporting ``find_all``.
        base_url: URL used to resolve relative hrefs.

    Returns:
        List of dicts with "Text" and "URL" keys. Anchors with no visible
        text get the placeholder "No Text".
    """
    links = []
    for anchor in soup.find_all('a', href=True):
        # urljoin returns absolute references unchanged (RFC 3986), so no
        # manual scheme check is needed; it also resolves scheme-relative
        # '//host/path' hrefs that a startswith(("http://", "https://"))
        # test would treat as relative.
        full_url = urljoin(base_url, anchor['href'])
        link_text = anchor.get_text(strip=True) or "No Text"
        links.append({"Text": link_text, "URL": full_url})
    return links
|
|
def extract_images(soup, base_url):
    """Extract all image URLs and their alt text from the parsed page.

    Args:
        soup: Parsed document (BeautifulSoup-like) supporting ``find_all``.
        base_url: URL used to resolve relative src attributes.

    Returns:
        List of dicts with "Alt Text" and "Image URL" keys. Images without
        an alt attribute get the placeholder "No Alt Text".
    """
    images = []
    for img in soup.find_all('img', src=True):
        # urljoin returns absolute references unchanged (RFC 3986), so the
        # explicit http/https prefix check the original carried is redundant.
        full_img_url = urljoin(base_url, img['src'])
        alt_text = img.get('alt', 'No Alt Text')
        images.append({"Alt Text": alt_text, "Image URL": full_img_url})
    return images
|
|
def format_detailed_output(structured_data):
    """Render the extracted page data as a Markdown report.

    Args:
        structured_data: Dict with keys "Texts" (list of strings), "Links"
            (list of {"Text", "URL"} dicts) and "Images" (list of
            {"Alt Text", "Image URL"} dicts).

    Returns:
        A Markdown string with one section per category.
    """
    result = "### Structured Page Content\n\n"

    texts = structured_data["Texts"]
    result += "**Texts:**\n"
    result += (" ".join(texts) if texts else "No textual content found.") + "\n\n"

    result += "**Links:**\n"
    links = structured_data["Links"]
    if links:
        # End the section with a blank line so the next header starts its own
        # Markdown block (the original ran "**Images:**" into the links list).
        result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in links) + "\n\n"
    else:
        result += "No links found.\n\n"

    result += "**Images:**\n"
    images = structured_data["Images"]
    if images:
        result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in images) + "\n"
    else:
        result += "No images found.\n"
    return result
|
|
def download_and_process_web_page(url):
    """Fetch a web page and return its texts, links and images as Markdown.

    Args:
        url: Target address; a missing scheme is assumed to be plain HTTP.

    Returns:
        A Markdown report string, or an error message string on failure.
    """
    # Default to plain HTTP when the caller omitted the scheme.
    if not url.startswith(("http://", "https://")):
        url = "http://" + url

    try:
        page = get(url)
        parsed = page.soup()
        structured_data = {
            "Texts": extract_texts(parsed),
            "Links": extract_links(parsed, url),
            "Images": extract_images(parsed, url)
        }
        return format_detailed_output(structured_data)
    except urllib3.exceptions.HTTPError as e:
        # Transport-level failures raised by urllib3.
        return f"Error: {e}"
    except Exception as e:
        # Anything else (parsing, formatting) is reported rather than raised.
        return f"Error processing web page: {e}"
|
|
# Build the Gradio UI: a single URL textbox in, a Markdown report out.
iface = gr.Interface(
    fn=download_and_process_web_page,
    inputs=gr.Textbox(lines=1, placeholder="Enter URL of the web page"),
    outputs=gr.Markdown(label="Web Page Content"),
    title="Web Page Processor for Hugging Face Chat Tools",
    description="Enter the URL of a web page. The tool will extract and display the structured content of the page, including text, links, and images. This tool is designed for use with Hugging Face Chat Tools."
)

# Start the Gradio server (blocks until interrupted).
iface.launch()