Spaces:

play7284
/

web

Sleeping

App Files Files Community

play7284 committed on Oct 17, 2025

Commit

1a64d59

verified ·

1 Parent(s): 915fd5d

Update app.py

Browse files

Files changed (1) hide show

app.py +150 -163

app.py CHANGED Viewed

@@ -1,163 +1,150 @@
-  import gradio as gr
-  import httpx
-  import readabilipy
-  import markdownify
-  import asyncio
-  # 使用与原始服务器中用户发起请求时相同的 User-Agent
-  # 这有助于模拟常规浏览器行为，减少被网站屏蔽的概率
-  USER_AGENT = "ModelContextProtocol/1.0 (User-Specified;
-  +https://github.com/modelcontextprotocol/servers)"
-  async def fetch(url: str, force_raw: bool) -> str:
-      """
-      异步获取并处理给定 URL 的内容。
-      Args:
-          url: 要抓取的网页地址。
-          force_raw: 如果为 True，则返回原始 HTML/文本，不进行 Markdown 转换。
-      Returns:
-          处理后的内容字符串。
-      """
-      if not url or not url.strip():
-          return "错误：URL 不能为空。"
-      # 为了方便用户，如果 URL 缺少协议头，则自动添加 https://
-      if not url.startswith(('http://', 'https://')):
-          url = 'https://' + url
-      try:
-          # 使用 httpx 库发起异步 HTTP 请求，设置
-  User-Agent、允许重定向并设置30秒超时
-          async with httpx.AsyncClient(headers={"User-Agent": USER_AGENT},
-  follow_redirects=True, timeout=30.0) as client:
-              print(f"正在请求: {url}...")
-              response = await client.get(url)
-              # 检查请求是否成功，如果不成功（例如404或500错误），则会抛出异常
-              response.raise_for_status()
-              print(f"收到响应，状态码: {response.status_code}")
-              page_raw = response.text
-              content_type = response.headers.get("content-type", "").lower()
-              # 判断内容是否为 HTML
-              # 通过检查 Content-Type 或内容开头是否包含 <html> 标签来判断
-              is_page_html = "text/html" in content_type or
-  page_raw.strip().lower().startswith("<html")
-              if is_page_html and not force_raw:
-                  # 1. 使用 readabilipy 从原始 HTML 中提取核心文章内容
-                  # 这可以有效去除广告、导航栏、页脚等无关元素
-                  article =
-  readabilipy.simple_json.simple_json_from_html_string(page_raw,
-  use_readability=True)
-                  if not article or not article.get("content"):
-                      return f"##
-  无法解析页面\n\n未能成功提取页面主要内容。以下是原始
-  HTML：\n\n```html\n{page_raw}\n```"
-                  # 2. 将清理后的 HTML 转换为 Markdown 格式，使其更易于阅读
-                  html_content = article["content"]
-                  markdown_content = markdownify.markdownify(html_content,
-  heading_style=markdownify.ATX)
-                  title = article.get('title', '无标题')
-                  return f"# {title}\n\n{markdown_content}"
-              else:
-                  # 如果内容不是 HTML 或用户选择了"原始模式"，则直接返回原始内容
-                  # 使用代码块包裹，以便更好地显示
-                  prefix = f"已获取原始网页内容 (Content-Type:
-  {content_type}):\n\n"
-                  lang = "html" if is_page_html else "text"
-                  return f"{prefix}```{lang}\n{page_raw}\n```"
-      except httpx.RequestError as e:
-          return f"访问 URL 时发生网络错误: {url}\n\n详细信息: {e}"
-      except Exception as e:
-          return f"发生未知错误: {e}"
-  def copy_to_clipboard(content: str) -> str:
-      """
-      将内容复制到剪贴板
-      """
-      try:
-          import pyperclip
-          pyperclip.copy(content)
-          return "✅ 内容已复制到剪贴板！"
-      except ImportError:
-          return "❌ 无法复制：请安装 pyperclip 库 (pip install pyperclip)"
-      except Exception as e:
-          return f"❌ 复制失败：{str(e)}"
-  # 使用 gr.Blocks() 创建 Gradio 界面，可以更自由地布局
-  with gr.Blocks(theme=gr.themes.Soft(), title="网页内容提取工具") as demo:
-      gr.Markdown("# 网页内容提取工具 (Fetch)")
-      gr.Markdown(
-          "输入一个网址，此工具可以提取其主要内容并将其转换为干净的 Markdown
-  格式。"
-          "非常适合用于阅读文章，可以去除广告和导航等干扰元素。"
-      )
-      with gr.Row():
-          url_input = gr.Textbox(
-              label="输入网址",
-              placeholder="例如:
-  en.wikipedia.org/wiki/Python_(programming_language)",
-              scale=4  # 使输入框更宽
-          )
-          raw_checkbox = gr.Checkbox(
-              label="获取原始 HTML",
-              info="如果勾选，将返回未经处理的完整网页 HTML 代码。",
-              scale=1
-          )
-      with gr.Row():
-          submit_btn = gr.Button("提取内容", variant="primary")
-          copy_btn = gr.Button("📋 复制内容", variant="secondary")
-      output_markdown = gr.Markdown(label="提取结果")
-      copy_status = gr.Textbox(label="复制状态", interactive=False, visible=False)
-      # 提供一些示例，方便用户快速体验
-      gr.Examples(
-          examples=[
-              ["https://modelcontextprotocol.io/", False],
-              ["https://www.gradio.app/guides/quickstart", False],
-              ["https://www.anthropic.com/news/claude-3-5-sonnet", False],
-          ],
-          inputs=[url_input, raw_checkbox],
-          outputs=output_markdown,
-          fn=fetch,
-          cache_examples=False, # 禁用缓存，确保每次都获取最新网页内容
-      )
-      # 将按钮的点击事件与核心处理函数绑定
-      submit_btn.click(
-          fn=fetch,
-          inputs=[url_input, raw_checkbox],
-          outputs=output_markdown,
-          api_name="fetch_content" # 为 API 模式命名
-      )
-      # 复制按钮点击事件
-      copy_btn.click(
-          fn=copy_to_clipboard,
-          inputs=[output_markdown],
-          outputs=[copy_status]
-      ).then(
-          lambda: gr.update(visible=True),
-          outputs=[copy_status]
-      ).then(
-          lambda: gr.update(visible=False),
-          inputs=None,
-          outputs=[copy_status],
-          _js="(x) => new Promise((resolve) => setTimeout(() => resolve(), 2000))"
-      )
-  if __name__ == "__main__":
-      # 启动 Gradio 应用
-      demo.launch(mcp_server=True)

+import gradio as gr
+import httpx
+import readabilipy
+import markdownify
+import asyncio
# User-Agent header sent with every outbound request; the same value the
# original server uses for user-initiated fetches.
# NOTE(review): an earlier comment said this mimics a regular browser. The value
# below is an explicit tool identifier, so sites that block non-browser agents
# may still refuse the request.
USER_AGENT = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)"


def _fenced(text: str, lang: str) -> str:
    """Wrap *text* in a Markdown code fence that *text* itself cannot terminate."""
    fence = "```"
    # Grow the fence until it no longer occurs in the payload, so backtick runs
    # inside the fetched page cannot close the block early.
    while fence in text:
        fence += "`"
    return f"{fence}{lang}\n{text}\n{fence}"


def _article_to_markdown(page_raw: str) -> str:
    """Extract the main article from *page_raw* and render it as Markdown.

    Synchronous; `fetch` runs it in a worker thread so the event loop stays free.
    """
    article = readabilipy.simple_json.simple_json_from_html_string(page_raw, use_readability=True)
    if not article or not article.get("content"):
        # Extraction produced nothing usable: fall back to showing the raw HTML.
        return (
            "## 无法解析页面\n\n未能成功提取页面主要内容。以下是原始 HTML：\n\n"
            + _fenced(page_raw, "html")
        )
    markdown_content = markdownify.markdownify(article["content"], heading_style=markdownify.ATX)
    # `or` (rather than a .get default) also covers a title that is present but None/empty.
    title = article.get("title") or "无标题"
    return f"# {title}\n\n{markdown_content}"


async def fetch(url: str, force_raw: bool) -> str:
    """Fetch *url* and return its content as display-ready Markdown.

    Args:
        url: Address of the page to fetch. Surrounding whitespace is ignored,
            and "https://" is prepended when no http(s) scheme is present.
        force_raw: When True, skip article extraction and return the response
            body verbatim inside a code fence.

    Returns:
        A Markdown string: the extracted article (title heading plus body), the
        raw body in a code fence, or a human-readable error message. Errors are
        reported in the return value rather than raised, so the UI can show them.
    """
    # Normalise first so that leading whitespace cannot defeat the scheme check.
    url = (url or "").strip()
    if not url:
        return "错误：URL 不能为空。"
    # Schemes are case-insensitive ("HTTP://example.com" is valid).
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    # NOTE(security): the URL is user-supplied and fetched from the server, with
    # redirects followed. Nothing here prevents requests to internal/private
    # addresses; add an allow/deny check if this app is exposed publicly.
    try:
        async with httpx.AsyncClient(headers={"User-Agent": USER_AGENT}, follow_redirects=True, timeout=30.0) as client:
            print(f"正在请求: {url}...")
            response = await client.get(url)
            # Raises httpx.HTTPStatusError for 4xx/5xx responses.
            response.raise_for_status()
            print(f"收到响应，状态码: {response.status_code}")
            page_raw = response.text
            content_type = response.headers.get("content-type", "").lower()

        # Treat the body as HTML when the header says so, or when the document
        # itself opens with an <html> tag or an HTML doctype.
        leading = page_raw.lstrip()[:32].lower()
        is_page_html = "text/html" in content_type or leading.startswith(("<html", "<!doctype html"))

        if is_page_html and not force_raw:
            # Extraction and conversion are blocking calls; keep them off the event loop.
            return await asyncio.to_thread(_article_to_markdown, page_raw)

        # Non-HTML content, or raw mode requested: return the body unchanged in a code fence.
        prefix = f"已获取原始网页内容 (Content-Type: {content_type}):\n\n"
        return prefix + _fenced(page_raw, "html" if is_page_html else "text")
    except httpx.HTTPStatusError as e:
        # Not a RequestError subclass, so it needs its own handler to avoid
        # being reported as an "unknown error".
        return f"请求失败，服务器返回 HTTP {e.response.status_code}: {url}"
    except httpx.RequestError as e:
        return f"访问 URL 时发生网络错误: {url}\n\n详细信息: {e}"
    except Exception as e:
        # UI boundary: surface anything unexpected as text instead of crashing the handler.
        return f"发生未知错误: {e}"
def copy_to_clipboard(content: str) -> str:
    """Copy *content* to the clipboard via pyperclip and report the outcome.

    Args:
        content: Text to place on the clipboard.

    Returns:
        A status message: success, a hint to install pyperclip when it cannot
        be imported, or the failure reason for any other error.
    """
    # NOTE(review): this runs in the Python process, so it targets the clipboard
    # of the machine running the app, which may not be the browser user's
    # machine — confirm this is the intended behaviour.
    try:
        # Imported lazily so the app still starts when pyperclip is not installed.
        import pyperclip as clipboard

        clipboard.copy(content)
    except ImportError:
        outcome = "❌ 无法复制：请安装 pyperclip 库 (pip install pyperclip)"
    except Exception as err:
        outcome = f"❌ 复制失败：{err!s}"
    else:
        outcome = "✅ 内容已复制到剪贴板！"
    return outcome
# Build the interface with gr.Blocks() for free-form layout.
with gr.Blocks(theme=gr.themes.Soft(), title="网页内容提取工具") as demo:
    gr.Markdown("# 网页内容提取工具 (Fetch)")
    gr.Markdown(
        "输入一个网址，此工具可以提取其主要内容并将其转换为干净的 Markdown 格式。"
        "非常适合用于阅读文章，可以去除广告和导航等干扰元素。"
    )

    with gr.Row():
        url_input = gr.Textbox(
            label="输入网址",
            placeholder="例如: en.wikipedia.org/wiki/Python_(programming_language)",
            scale=4,  # make the URL box wider than the checkbox
        )
        raw_checkbox = gr.Checkbox(
            label="获取原始 HTML",
            info="如果勾选，将返回未经处理的完整网页 HTML 代码。",
            scale=1,
        )

    with gr.Row():
        submit_btn = gr.Button("提取内容", variant="primary")
        copy_btn = gr.Button("📋 复制内容", variant="secondary")

    output_markdown = gr.Markdown(label="提取结果")
    # Hidden until a copy is attempted; shown briefly by the chain below.
    copy_status = gr.Textbox(label="复制状态", interactive=False, visible=False)

    # Sample URLs so users can try the tool quickly.
    gr.Examples(
        examples=[
            ["https://modelcontextprotocol.io/", False],
            ["https://www.gradio.app/guides/quickstart", False],
            ["https://www.anthropic.com/news/claude-3-5-sonnet", False],
        ],
        inputs=[url_input, raw_checkbox],
        outputs=output_markdown,
        fn=fetch,
        cache_examples=False,  # no caching, so each run fetches the live page
    )

    # Wire the submit button to the fetch handler.
    submit_btn.click(
        fn=fetch,
        inputs=[url_input, raw_checkbox],
        outputs=output_markdown,
        api_name="fetch_content",  # endpoint name when called through the API
    )

    # Copy button: run the copy, reveal the status box, then hide it again
    # after the JavaScript delay has elapsed.
    copy_btn.click(
        fn=copy_to_clipboard,
        inputs=[output_markdown],
        outputs=[copy_status],
    ).then(
        lambda: gr.update(visible=True),
        outputs=[copy_status],
    ).then(
        lambda: gr.update(visible=False),
        inputs=None,
        outputs=[copy_status],
        # The keyword is `js`: the old `_js` spelling was removed in Gradio 4,
        # and `mcp_server=True` below needs a newer Gradio than that, so `_js`
        # would raise TypeError while the UI is being built.
        js="(x) => new Promise((resolve) => setTimeout(() => resolve(), 2000))",
    )

if __name__ == "__main__":
    # Start the Gradio app and also expose it as an MCP server.
    demo.launch(mcp_server=True)