Update app.py
Browse files
app.py
CHANGED
|
@@ -40,16 +40,6 @@ def extract_links(soup, base_url):
|
|
| 40 |
links.append({"Text": link_text, "URL": full_url})
|
| 41 |
return links
|
| 42 |
|
| 43 |
-
def extract_links(soup, base_url):
|
| 44 |
-
"""Extracts all valid links from the soup."""
|
| 45 |
-
links = []
|
| 46 |
-
for link in soup.find_all('a', href=True):
|
| 47 |
-
href = link['href']
|
| 48 |
-
full_url = urljoin(base_url, href) if not href.startswith(("http://", "https://")) else href
|
| 49 |
-
link_text = link.get_text(strip=True) or "No Text"
|
| 50 |
-
links.append({"Text": link_text, "URL": full_url})
|
| 51 |
-
return links
|
| 52 |
-
|
| 53 |
def extract_images(soup, base_url):
|
| 54 |
"""Extracts all valid image URLs and their alt text from the soup."""
|
| 55 |
images = []
|
|
@@ -60,14 +50,9 @@ def extract_images(soup, base_url):
|
|
| 60 |
images.append({"Alt Text": alt_text, "Image URL": full_img_url})
|
| 61 |
return images
|
| 62 |
|
| 63 |
-
def extract_page_title(soup):
|
| 64 |
-
"""Extracts the page title from the soup."""
|
| 65 |
-
title_tag = soup.find('title')
|
| 66 |
-
return title_tag.get_text(strip=True) if title_tag else "No Title Found"
|
| 67 |
-
|
| 68 |
def format_detailed_output(structured_data):
|
| 69 |
"""Formats the structured data into a Markdown string."""
|
| 70 |
-
result =
|
| 71 |
result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
|
| 72 |
result += "**Links:**\n"
|
| 73 |
if structured_data["Links"]:
|
|
@@ -91,7 +76,6 @@ def download_and_process_web_page(url):
|
|
| 91 |
response = get(url)
|
| 92 |
soup = response.soup()
|
| 93 |
structured_data = {
|
| 94 |
-
"Page Title": extract_page_title(soup),
|
| 95 |
"Texts": extract_texts(soup),
|
| 96 |
"Links": extract_links(soup, url),
|
| 97 |
"Images": extract_images(soup, url)
|
|
@@ -109,7 +93,7 @@ iface = gr.Interface(
|
|
| 109 |
inputs=gr.Textbox(lines=1, placeholder="Enter URL of the web page"),
|
| 110 |
outputs=gr.Markdown(label="Web Page Content"),
|
| 111 |
title="Web Page Processor for Hugging Face Chat Tools",
|
| 112 |
-
description="Enter the URL of a web page. The tool will extract and display the structured content of the page, including
|
| 113 |
share=False # Set share to False to remove the "Share via link" button
|
| 114 |
)
|
| 115 |
|
|
|
|
| 40 |
links.append({"Text": link_text, "URL": full_url})
|
| 41 |
return links
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
def extract_images(soup, base_url):
|
| 44 |
"""Extracts all valid image URLs and their alt text from the soup."""
|
| 45 |
images = []
|
|
|
|
| 50 |
images.append({"Alt Text": alt_text, "Image URL": full_img_url})
|
| 51 |
return images
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
def format_detailed_output(structured_data):
|
| 54 |
"""Formats the structured data into a Markdown string."""
|
| 55 |
+
result = "### Structured Page Content\n\n"
|
| 56 |
result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
|
| 57 |
result += "**Links:**\n"
|
| 58 |
if structured_data["Links"]:
|
|
|
|
| 76 |
response = get(url)
|
| 77 |
soup = response.soup()
|
| 78 |
structured_data = {
|
|
|
|
| 79 |
"Texts": extract_texts(soup),
|
| 80 |
"Links": extract_links(soup, url),
|
| 81 |
"Images": extract_images(soup, url)
|
|
|
|
| 93 |
inputs=gr.Textbox(lines=1, placeholder="Enter URL of the web page"),
|
| 94 |
outputs=gr.Markdown(label="Web Page Content"),
|
| 95 |
title="Web Page Processor for Hugging Face Chat Tools",
|
| 96 |
+
description="Enter the URL of a web page. The tool will extract and display the structured content of the page, including text, links, and images. This tool is designed for use with Hugging Face Chat Tools.",
|
| 97 |
share=False # Set share to False to remove the "Share via link" button
|
| 98 |
)
|
| 99 |
|