Spaces:

wkplhc
/

url

Paused

App Files Files Community

wkplhc committed on Sep 8, 2025

Commit

08dd4ad

verified ·

1 Parent(s): 17ca1df

Update app.py

Browse files

Files changed (1) hide show

app.py +106 -87

app.py CHANGED Viewed

@@ -3,6 +3,10 @@ import requests
 from bs4 import BeautifulSoup
 import re
 from urllib.parse import urljoin, urlparse
 def is_valid_url(url):
     """检查URL是否有效"""
@@ -13,133 +17,148 @@ def is_valid_url(url):
         return False
 def extract_video_urls(url):
-    """从给定URL提取视频地址"""
     if not is_valid_url(url):
-        return "❌ 无效的URL，请检查格式是否正确（需包含http://或https://）"
     try:
         # 设置请求头，模拟浏览器访问
         headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
         }
         # 发送请求获取网页内容
         response = requests.get(url, headers=headers, timeout=10)
-        response.raise_for_status()  # 抛出HTTP错误
-        # 解析HTML
-        soup = BeautifulSoup(response.text, 'html.parser')
         # 存储提取到的视频URL
-        video_urls = []
-        # 1. 从video标签提取
-        video_tags = soup.find_all('video')
-        for tag in video_tags:
-            if 'src' in tag.attrs:
-                video_url = urljoin(url, tag['src'])
-                video_urls.append(f"🎬 视频标签: {video_url}")
-        # 2. 从source标签提取
-        source_tags = soup.find_all('source')
-        for tag in source_tags:
-            if 'src' in tag.attrs:
-                video_url = urljoin(url, tag['src'])
-                video_urls.append(f"📽️ 源标签: {video_url}")
-        # 3. 从iframe标签提取可能包含视频的链接
-        iframe_tags = soup.find_all('iframe')
-        for tag in iframe_tags:
-            if 'src' in tag.attrs:
-                iframe_url = urljoin(url, tag['src'])
-                video_urls.append(f"🔗 嵌入框架: {iframe_url}")
-        # 4. 搜索可能的视频URL模式
-        video_patterns = [
-            r'https?://[^"\']+\.(mp4|webm|mov|avi|flv|mkv)',
-            r'src=[\'"](https?://[^"\']+\.(mp4|webm|mov|avi|flv|mkv))[\'"]'
-        ]
-        for pattern in video_patterns:
-            matches = re.findall(pattern, response.text)
-            for match in matches:
-                video_url = match[0] if isinstance(match, tuple) else match
-                if video_url not in [u.split(": ", 1)[1] for u in video_urls]:
-                    video_urls.append(f"🔍 检测到视频: {video_url}")
-        # 去重处理
-        unique_urls = []
-        seen = set()
-        for url_entry in video_urls:
-            url_part = url_entry.split(": ", 1)[1]
-            if url_part not in seen:
-                seen.add(url_part)
-                unique_urls.append(url_entry)
-        if not unique_urls:
-            return "ℹ️ 未在该网页中找到视频地址"
         else:
-            return "\n\n".join(unique_urls)
     except requests.exceptions.Timeout:
-        return "⏱️ 请求超时，请稍后再试"
-    except requests.exceptions.HTTPError as e:
-        return f"❌ HTTP错误: {str(e)}"
     except requests.exceptions.RequestException as e:
         return f"❌ 请求失败: {str(e)}"
     except Exception as e:
-        return f"❌ 解析错误: {str(e)}"
-# 创建Gradio界面
-with gr.Blocks(title="视频地址提取工具", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("""
-    # 🎥 视频地址提取工具
-    输入包含视频的网页URL，提取该页面中所有视频的真实地址。
-    使用说明:
-    1. 输入完整的网页URL（需包含http://或https://）
-    2. 点击"提取视频地址"按钮
-    3. 等待解析完成后查看结果
-    """)
     with gr.Row():
-        url_input = gr.Textbox(
-            label="网页URL",
-            placeholder="例如: https://example.com/video-page",
-            lines=1,
-            container=True
-        )
-        extract_btn = gr.Button("提取视频地址", variant="primary")
     result_output = gr.Textbox(
         label="提取结果",
         lines=10,
-        container=True
     )
-    # 设置按钮点击事件
     extract_btn.click(
-        fn=extract_video_urls,
-        inputs=url_input,
-        outputs=result_output
     )
-    # 设置回车键触发提取
     url_input.submit(
-        fn=extract_video_urls,
-        inputs=url_input,
-        outputs=result_output
     )
-    gr.Markdown("""
-    ⚠️ 注意:
-    - 部分网站可能因防盗链或权限限制无法提取视频
-    - 提取结果仅供学习研究使用
-    - 大型网页可能需要较长解析时间
-    """)
 # 启动应用
 if __name__ == "__main__":
     demo.launch()

 from bs4 import BeautifulSoup
 import re
 from urllib.parse import urljoin, urlparse
+import time
+# 支持的视频格式
+SUPPORTED_FORMATS = {'.mp4', '.webm', '.mov', '.avi', '.mkv', '.flv', '.wmv', '.mpeg', '.mpg'}
 def is_valid_url(url):
     """检查URL是否有效"""
         return False
 def extract_video_urls(url):
+    """从给定URL提取所有视频地址"""
     if not is_valid_url(url):
+        return "❌ 无效的URL，请确保包含http://或https://"
     try:
         # 设置请求头，模拟浏览器访问
         headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
         }
         # 发送请求获取网页内容
         response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()  # 检查请求是否成功
+        html_content = response.text
+        # 使用BeautifulSoup解析HTML
+        soup = BeautifulSoup(html_content, 'html.parser')
         # 存储提取到的视频URL
+        video_urls = set()
+        # 1. 提取video标签中的视频
+        for video_tag in soup.find_all('video'):
+            if 'src' in video_tag.attrs:
+                video_url = urljoin(url, video_tag['src'])
+                video_urls.add(video_url)
+            # 提取source标签中的视频
+            for source_tag in video_tag.find_all('source'):
+                if 'src' in source_tag.attrs:
+                    video_url = urljoin(url, source_tag['src'])
+                    video_urls.add(video_url)
+        # 2. 提取iframe中的视频链接
+        for iframe_tag in soup.find_all('iframe'):
+            if 'src' in iframe_tag.attrs:
+                iframe_url = urljoin(url, iframe_tag['src'])
+                # 简单检查iframe是否可能包含视频
+                if any(fmt in iframe_url.lower() for fmt in SUPPORTED_FORMATS) or 'video' in iframe_url.lower():
+                    video_urls.add(iframe_url)
+        # 3. 使用正则表达式查找可能的视频URL
+        url_pattern = r'https?://[^\s"\']+'
+        matches = re.findall(url_pattern, html_content)
+        for match in matches:
+            # 检查是否是视频文件
+            if any(match.lower().endswith(fmt) for fmt in SUPPORTED_FORMATS):
+                video_urls.add(match)
+        # 4. 查找可能的视频API链接
+        api_pattern = r'https?://[^\s"\']+/video/[^\s"\']+'
+        api_matches = re.findall(api_pattern, html_content)
+        for api_match in api_matches:
+            video_urls.add(api_match)
+        # 整理结果
+        if video_urls:
+            # 按格式分类
+            format_counts = {}
+            for fmt in SUPPORTED_FORMATS:
+                count = sum(1 for url in video_urls if url.lower().endswith(fmt))
+                if count > 0:
+                    format_counts[fmt] = count
+            # 生成格式汇总信息
+            format_info = "📊 检测到的视频格式: "
+            format_info += ", ".join([f"{k} ({v}个)" for k, v in format_counts.items()])
+            # 生成视频URL列表
+            url_list = "\n\n📋 提取到的视频地址:\n"
+            for i, video_url in enumerate(sorted(video_urls), 1):
+                url_list += f"{i}. {video_url}\n"
+            return format_info + url_list
         else:
+            return "❌ 未找到任何视频地址。可能是该网页没有视频，或者视频采用了特殊方式加载。"
     except requests.exceptions.Timeout:
+        return "⏱️ 请求超时，请检查URL是否正确或稍后再试。"
     except requests.exceptions.RequestException as e:
         return f"❌ 请求失败: {str(e)}"
     except Exception as e:
+        return f"❌ 处理出错: {str(e)}"
def extract_with_progress(url, progress=gr.Progress()):
    """Run extract_video_urls(url) while reporting coarse progress to the UI.

    Args:
        url: Page URL passed through unchanged to extract_video_urls.
        progress: Gradio progress tracker. It is declared as a parameter
            default on purpose: Gradio documents this as the way to have
            the tracker connected to the running event. Callers normally
            leave it unset.

    Returns:
        The result string produced by extract_video_urls.
    """
    progress(0, desc="开始处理...")
    # Short pause, presumably so the first stage is visible to the user.
    time.sleep(0.5)
    progress(0.3, desc="正在请求网页内容...")
    result = extract_video_urls(url)
    progress(0.8, desc="正在整理结果...")
    time.sleep(0.5)
    progress(1.0, desc="完成!")
    return result
# Build the Gradio interface: URL box and button on one row, result box below.
with gr.Blocks(title="视频地址提取工具", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🔍 视频地址提取工具")
    gr.Markdown("输入包含视频的网页URL，提取该页面中所有视频的真实地址。")

    with gr.Row():
        # The input column is three times as wide as the button column.
        with gr.Column(scale=3):
            url_input = gr.Textbox(
                lines=1,
                label="网页URL",
                placeholder="例如: https://example.com/video-page",
            )
        with gr.Column(scale=1):
            extract_btn = gr.Button("提取视频地址", variant="primary", size="lg")

    # Read-only box that receives the extraction report.
    result_output = gr.Textbox(
        interactive=False,
        lines=10,
        label="提取结果",
    )

    gr.Markdown("""
    ### 使用说明
    1. 请输入完整的网页URL（必须包含http://或https://）
    2. 部分网站可能因防盗链或加密措施无法提取视频
    3. 提取结果包含视频格式统计和完整视频地址列表
    """)

    # Clicking the button and pressing Enter in the URL box run the same handler.
    for register in (extract_btn.click, url_input.submit):
        register(
            fn=extract_with_progress,
            inputs=[url_input],
            outputs=[result_output],
        )

# Start the app only when executed as a script.
if __name__ == "__main__":
    demo.launch()