Spaces:

KSh100
/

websearch

Build error

App Files Community

KSh100 committed on Apr 11, 2025

Commit

8ac60f7

verified ·

1 Parent(s): bcd351e

Update app.py

Browse files

Files changed (1) hide show

app.py +2 -18

app.py CHANGED Viewed

@@ -40,16 +40,6 @@ def extract_links(soup, base_url):
         links.append({"Text": link_text, "URL": full_url})
     return links
-def extract_links(soup, base_url):
-    """Extracts all valid links from the soup."""
-    links = []
-    for link in soup.find_all('a', href=True):
-        href = link['href']
-        full_url = urljoin(base_url, href) if not href.startswith(("http://", "https://")) else href
-        link_text = link.get_text(strip=True) or "No Text"
-        links.append({"Text": link_text, "URL": full_url})
-    return links
 def extract_images(soup, base_url):
     """Extracts all valid image URLs and their alt text from the soup."""
     images = []
@@ -60,14 +50,9 @@ def extract_images(soup, base_url):
         images.append({"Alt Text": alt_text, "Image URL": full_img_url})
     return images
-def extract_page_title(soup):
-    """Extracts the page title from the soup."""
-    title_tag = soup.find('title')
-    return title_tag.get_text(strip=True) if title_tag else "No Title Found"
 def format_detailed_output(structured_data):
     """Formats the structured data into a Markdown string."""
-    result = f"### Page Title: {structured_data['Page Title']}\n\n"
     result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
     result += "**Links:**\n"
     if structured_data["Links"]:
@@ -91,7 +76,6 @@ def download_and_process_web_page(url):
         response = get(url)
         soup = response.soup()
         structured_data = {
-            "Page Title": extract_page_title(soup),
             "Texts": extract_texts(soup),
             "Links": extract_links(soup, url),
             "Images": extract_images(soup, url)
@@ -109,7 +93,7 @@ iface = gr.Interface(
     inputs=gr.Textbox(lines=1, placeholder="Enter URL of the web page"),
     outputs=gr.Markdown(label="Web Page Content"),
     title="Web Page Processor for Hugging Face Chat Tools",
-    description="Enter the URL of a web page. The tool will extract and display the structured content of the page, including the title, text, links, and images. This tool is designed for use with Hugging Face Chat Tools.",
     share=False  # Set share to False to remove the "Share via link" button
 )

         links.append({"Text": link_text, "URL": full_url})
     return links
 def extract_images(soup, base_url):
     """Extracts all valid image URLs and their alt text from the soup."""
     images = []
         images.append({"Alt Text": alt_text, "Image URL": full_img_url})
     return images
 def format_detailed_output(structured_data):
     """Formats the structured data into a Markdown string."""
+    result = "### Structured Page Content\n\n"
     result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
     result += "**Links:**\n"
     if structured_data["Links"]:
         response = get(url)
         soup = response.soup()
         structured_data = {
             "Texts": extract_texts(soup),
             "Links": extract_links(soup, url),
             "Images": extract_images(soup, url)
     inputs=gr.Textbox(lines=1, placeholder="Enter URL of the web page"),
     outputs=gr.Markdown(label="Web Page Content"),
     title="Web Page Processor for Hugging Face Chat Tools",
+    description="Enter the URL of a web page. The tool will extract and display the structured content of the page, including text, links, and images. This tool is designed for use with Hugging Face Chat Tools.",
     share=False  # Set share to False to remove the "Share via link" button
 )