import re
import time
from urllib.parse import urljoin, urlparse

import gradio as gr
import requests
from bs4 import BeautifulSoup

# Supported video file extensions (lowercase, including the leading dot).
SUPPORTED_FORMATS = {'.mp4', '.webm', '.mov', '.avi', '.mkv', '.flv', '.wmv', '.mpeg', '.mpg'}

# Absolute http(s) URLs appearing anywhere in the raw HTML. '<' and '>' are
# excluded so that a URL inside a text node does not swallow the closing tag.
_URL_PATTERN = re.compile(r'https?://[^\s"\'<>]+')

# Absolute URLs containing a "/video/" path segment (possible video API links).
_API_PATTERN = re.compile(r'https?://[^\s"\'<>]+/video/[^\s"\'<>]+')

# Browser-like User-Agent sent with the page request.
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}


def is_valid_url(url):
    """Return True if *url* is an absolute http(s) URL with a network location.

    Args:
        url: The candidate URL string.

    Returns:
        bool: True when the scheme is ``http`` or ``https`` and a host part
        is present; False otherwise, including for values that cannot be
        parsed as a URL.
    """
    try:
        result = urlparse(url)
    except (ValueError, AttributeError, TypeError):
        # ValueError: malformed URL (e.g. a bad IPv6 literal);
        # AttributeError/TypeError: the input is not string-like.
        return False
    return result.scheme in ('http', 'https') and bool(result.netloc)


def _video_extension(url):
    """Return the supported extension that the URL's *path* ends with, or None.

    Only the path component is inspected, so a query string or fragment
    (``clip.mp4?token=abc``) does not hide the extension.
    """
    try:
        path = urlparse(url).path.lower()
    except ValueError:
        return None
    for fmt in SUPPORTED_FORMATS:
        if path.endswith(fmt):
            return fmt
    return None


def _collect_video_urls(html_content, base_url):
    """Gather candidate video URLs from *html_content*.

    Relative references are resolved against *base_url*.

    Returns:
        set[str]: The de-duplicated candidate URLs.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    video_urls = set()

    # 1. <video src=...> and the <source src=...> children of <video>.
    for video_tag in soup.find_all('video'):
        src = video_tag.get('src')
        if src:  # an empty src would resolve to the page URL itself
            video_urls.add(urljoin(base_url, src))
        for source_tag in video_tag.find_all('source'):
            src = source_tag.get('src')
            if src:
                video_urls.add(urljoin(base_url, src))

    # 2. <iframe> sources that look video-related (simple substring check).
    for iframe_tag in soup.find_all('iframe'):
        src = iframe_tag.get('src')
        if not src:
            continue
        iframe_url = urljoin(base_url, src)
        lowered = iframe_url.lower()
        if 'video' in lowered or any(fmt in lowered for fmt in SUPPORTED_FORMATS):
            video_urls.add(iframe_url)

    # 3. Any absolute URL in the raw HTML whose path ends in a video extension.
    for match in _URL_PATTERN.findall(html_content):
        if _video_extension(match):
            video_urls.add(match)

    # 4. Absolute URLs containing "/video/" (possible API endpoints).
    video_urls.update(_API_PATTERN.findall(html_content))

    return video_urls


def _format_results(video_urls):
    """Build the report text: per-format counts followed by a numbered list."""
    format_counts = {}
    for video_url in video_urls:
        fmt = _video_extension(video_url)
        if fmt:
            format_counts[fmt] = format_counts.get(fmt, 0) + 1

    # Sorted so the summary order is stable between runs (set order is not).
    format_info = "📊 检测到的视频格式: "
    format_info += ", ".join(f"{k} ({v}个)" for k, v in sorted(format_counts.items()))

    url_list = "\n\n📋 提取到的视频地址:\n"
    url_list += "".join(
        f"{i}. {video_url}\n" for i, video_url in enumerate(sorted(video_urls), 1)
    )
    return format_info + url_list


def extract_video_urls(url):
    """Fetch the page at *url* and extract every video address found in it.

    Args:
        url: Absolute http(s) URL of the page to scan.

    Returns:
        str: A human-readable report (format statistics plus a numbered list
        of URLs), or an error message. Request and parsing errors are
        reported in the returned string rather than raised.
    """
    if isinstance(url, str):
        # Tolerate stray whitespace pasted into the text box.
        url = url.strip()

    if not is_valid_url(url):
        return "❌ 无效的URL,请确保包含http://或https://"

    try:
        response = requests.get(url, headers=_REQUEST_HEADERS, timeout=10)
        response.raise_for_status()  # turn HTTP error statuses into exceptions

        # Resolve relative links against the final URL (after any redirects).
        video_urls = _collect_video_urls(response.text, response.url or url)

        if video_urls:
            return _format_results(video_urls)
        return "❌ 未找到任何视频地址。可能是该网页没有视频,或者视频采用了特殊方式加载。"

    except requests.exceptions.Timeout:
        return "⏱️ 请求超时,请检查URL是否正确或稍后再试。"
    except requests.exceptions.RequestException as e:
        return f"❌ 请求失败: {str(e)}"
    except Exception as e:
        # UI boundary: report unexpected failures to the user instead of crashing.
        return f"❌ 处理出错: {str(e)}"


def extract_with_progress(url, progress=gr.Progress()):
    """Run ``extract_video_urls`` while reporting progress to the Gradio UI.

    Args:
        url: Page URL entered by the user.
        progress: Progress tracker. It is declared as a default-valued
            parameter because that is how Gradio injects a live tracker
            into an event handler.

    Returns:
        str: The report or error message produced by ``extract_video_urls``.
    """
    progress(0, desc="开始处理...")
    time.sleep(0.5)  # short pause so the first progress message is visible

    progress(0.3, desc="正在请求网页内容...")
    result = extract_video_urls(url)

    progress(0.8, desc="正在整理结果...")
    time.sleep(0.5)

    progress(1.0, desc="完成!")
    return result


# Build the Gradio interface.
with gr.Blocks(title="视频地址提取工具", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🔍 视频地址提取工具")
    gr.Markdown("输入包含视频的网页URL,提取该页面中所有视频的真实地址。")

    with gr.Row():
        with gr.Column(scale=3):
            url_input = gr.Textbox(
                label="网页URL",
                placeholder="例如: https://example.com/video-page",
                lines=1
            )
        with gr.Column(scale=1):
            extract_btn = gr.Button("提取视频地址", variant="primary", size="lg")

    result_output = gr.Textbox(
        label="提取结果",
        lines=10,
        interactive=False
    )

    gr.Markdown("""
    ### 使用说明
    1. 请输入完整的网页URL(必须包含http://或https://)
    2. 部分网站可能因防盗链或加密措施无法提取视频
    3. 提取结果包含视频格式统计和完整视频地址列表
    """)

    # Wire up the button click.
    extract_btn.click(
        fn=extract_with_progress,
        inputs=[url_input],
        outputs=[result_output]
    )

    # Also submit when Enter is pressed in the URL box.
    url_input.submit(
        fn=extract_with_progress,
        inputs=[url_input],
        outputs=[result_output]
    )

# Launch the app when run as a script.
if __name__ == "__main__":
    demo.launch()