wkplhc committed on
Commit
08dd4ad
·
verified ·
1 Parent(s): 17ca1df

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -87
app.py CHANGED
@@ -3,6 +3,10 @@ import requests
3
  from bs4 import BeautifulSoup
4
  import re
5
  from urllib.parse import urljoin, urlparse
 
 
 
 
6
 
7
  def is_valid_url(url):
8
  """检查URL是否有效"""
@@ -13,133 +17,148 @@ def is_valid_url(url):
13
  return False
14
 
15
def extract_video_urls(url):
    """从给定URL提取视频地址 — scrape a web page and return candidate video URLs.

    Args:
        url: Full page URL, must include the http:// or https:// scheme.

    Returns:
        A display string: one labelled entry per URL found, joined by blank
        lines, or a status/error message string (this function never raises;
        all failures are reported in the return value).
    """
    if not is_valid_url(url):
        # BUGFIX: message was truncated ("…是否正(需包含http://或https://") —
        # restored the missing 确 and the closing parenthesis.
        return "❌ 无效的URL,请检查格式是否正确(需包含http://或https://)"

    try:
        # 设置请求头,模拟浏览器访问 (browser-like UA to avoid trivial bot blocks)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }

        # 发送请求获取网页内容
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # raise HTTPError for 4xx/5xx

        # 解析HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collected display entries, each "label: url"
        video_urls = []

        # 1. 从video标签提取 — <video src=...>
        for tag in soup.find_all('video'):
            if 'src' in tag.attrs:
                video_url = urljoin(url, tag['src'])  # resolve relative src
                video_urls.append(f"🎬 视频标签: {video_url}")

        # 2. 从source标签提取 — <source src=...>
        for tag in soup.find_all('source'):
            if 'src' in tag.attrs:
                video_url = urljoin(url, tag['src'])
                video_urls.append(f"📽️ 源标签: {video_url}")

        # 3. 从iframe标签提取可能包含视频的链接
        for tag in soup.find_all('iframe'):
            if 'src' in tag.attrs:
                iframe_url = urljoin(url, tag['src'])
                video_urls.append(f"🔗 嵌入框架: {iframe_url}")

        # 4. 搜索可能的视频URL模式 (raw-HTML regex sweep)
        # BUGFIX: the extension alternation must be NON-capturing (?:...).
        # With a capturing group, re.findall returns only the group text
        # ("mp4"), so the bare extension — not the URL — was appended.
        video_patterns = [
            r'https?://[^"\']+\.(?:mp4|webm|mov|avi|flv|mkv)',
            r'src=[\'"](https?://[^"\']+\.(?:mp4|webm|mov|avi|flv|mkv))[\'"]'
        ]

        for pattern in video_patterns:
            for match in re.findall(pattern, response.text):
                # Second pattern yields tuples when it has multiple groups;
                # keep the tuple-safe unwrap for robustness.
                video_url = match[0] if isinstance(match, tuple) else match
                if video_url not in [u.split(": ", 1)[1] for u in video_urls]:
                    video_urls.append(f"🔍 检测到视频: {video_url}")

        # 去重处理 — keep first occurrence of each URL, preserving order
        unique_urls = []
        seen = set()
        for url_entry in video_urls:
            url_part = url_entry.split(": ", 1)[1]
            if url_part not in seen:
                seen.add(url_part)
                unique_urls.append(url_entry)

        if not unique_urls:
            return "ℹ️ 未在该网页中找到视频地址"
        else:
            return "\n\n".join(unique_urls)

    except requests.exceptions.Timeout:
        return "⏱️ 请求超时,请稍后再试"
    except requests.exceptions.HTTPError as e:
        return f"❌ HTTP错误: {str(e)}"
    except requests.exceptions.RequestException as e:
        return f"❌ 请求失败: {str(e)}"
    except Exception as e:
        # BUGFIX: message was truncated ("❌ 解析: ") — completed it.
        return f"❌ 解析失败: {str(e)}"
92
 
93
# Build the Gradio UI: one URL input, a trigger button, and a result box.
with gr.Blocks(title="视频地址提取工具", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎥 视频地址提取工具

    输入包含视频的网页URL,提取该页面中所有视频的真实地址。

    使用说明:
    1. 输入完整的网页URL(需包含http://或https://)
    2. 点击"提取视频地址"按钮
    3. 等待解析完成后查看结果
    """)

    with gr.Row():
        url_input = gr.Textbox(
            label="网页URL",
            placeholder="例如: https://example.com/video-page",
            lines=1,
            container=True
        )

        extract_btn = gr.Button("提取视频地址", variant="primary")

    result_output = gr.Textbox(
        label="提取结果",
        lines=10,
        container=True
    )

    # Button click runs the extractor and shows its string result.
    extract_btn.click(
        fn=extract_video_urls,
        inputs=url_input,
        outputs=result_output
    )

    # Pressing Enter in the URL textbox triggers the same extraction.
    url_input.submit(
        fn=extract_video_urls,
        inputs=url_input,
        outputs=result_output
    )

    gr.Markdown("""
    ⚠️ 注意:
    - 部分网站可能因防盗链或权限限制无法提取视频
    - 提取结果仅供学习研究使用
    - 大型网页可能需要较长解析时间
    """)

# Launch the app only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()
 
 
3
  from bs4 import BeautifulSoup
4
  import re
5
  from urllib.parse import urljoin, urlparse
6
+ import time
7
+
8
# Video file extensions (lowercase, with leading dot) treated as video
# files when scanning tag attributes and raw HTML for direct media links.
SUPPORTED_FORMATS = {'.mp4', '.webm', '.mov', '.avi', '.mkv', '.flv', '.wmv', '.mpeg', '.mpg'}
10
 
11
  def is_valid_url(url):
12
  """检查URL是否有效"""
 
17
  return False
18
 
19
def extract_video_urls(url):
    """从给定URL提取所有视频地址 — scrape a page and return every video URL found.

    Args:
        url: Full page URL, must include the http:// or https:// scheme.

    Returns:
        On success, a display string with a per-format count summary plus a
        numbered, sorted list of the URLs; otherwise a status/error message
        string (this function never raises — all failures are reported in
        the return value).
    """
    if not is_valid_url(url):
        # BUGFIX: message was garbled ("请确包含…") — restored the missing 保.
        return "❌ 无效的URL,请确保包含http://或https://"

    def _is_video_file(candidate):
        # BUGFIX helper: compare against the URL *path* so query strings and
        # fragments (e.g. ".../video.mp4?token=x") don't defeat the
        # extension check that a plain str.endswith on the full URL missed.
        path = urlparse(candidate).path.lower()
        return any(path.endswith(fmt) for fmt in SUPPORTED_FORMATS)

    try:
        # 设置请求头,模拟浏览器访问 (browser-like UA to avoid trivial bot blocks)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        # 发送请求获取网页内容
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # raise HTTPError for 4xx/5xx
        html_content = response.text

        # 使用BeautifulSoup解析HTML
        soup = BeautifulSoup(html_content, 'html.parser')

        # A set deduplicates automatically.
        video_urls = set()

        # 1. <video> tags, plus their nested <source> children
        for video_tag in soup.find_all('video'):
            if 'src' in video_tag.attrs:
                video_urls.add(urljoin(url, video_tag['src']))
            for source_tag in video_tag.find_all('source'):
                if 'src' in source_tag.attrs:
                    video_urls.add(urljoin(url, source_tag['src']))

        # 2. iframes that plausibly embed a video
        for iframe_tag in soup.find_all('iframe'):
            if 'src' in iframe_tag.attrs:
                iframe_url = urljoin(url, iframe_tag['src'])
                # Heuristic: extension appears anywhere in the URL, or the
                # word "video" does — iframes rarely end in a media suffix.
                if any(fmt in iframe_url.lower() for fmt in SUPPORTED_FORMATS) or 'video' in iframe_url.lower():
                    video_urls.add(iframe_url)

        # 3. raw regex sweep of the HTML for direct media links
        url_pattern = r'https?://[^\s"\']+'
        for candidate in re.findall(url_pattern, html_content):
            if _is_video_file(candidate):
                video_urls.add(candidate)

        # 4. likely video-API endpoints (path contains /video/)
        api_pattern = r'https?://[^\s"\']+/video/[^\s"\']+'
        for api_match in re.findall(api_pattern, html_content):
            video_urls.add(api_match)

        # 整理结果
        if video_urls:
            # Per-extension counts (path-based, consistent with _is_video_file;
            # note: renamed the loop variable — the original genexp shadowed
            # the `url` parameter).
            format_counts = {}
            for fmt in SUPPORTED_FORMATS:
                count = sum(1 for found in video_urls
                            if urlparse(found).path.lower().endswith(fmt))
                if count > 0:
                    format_counts[fmt] = count

            # 生成格式汇总信息
            format_info = "📊 检测到的视频格式: "
            format_info += ", ".join([f"{k} ({v}个)" for k, v in format_counts.items()])

            # 生成视频URL列表 (sorted for a stable display order)
            url_list = "\n\n📋 提取到的视频地址:\n"
            for i, video_url in enumerate(sorted(video_urls), 1):
                url_list += f"{i}. {video_url}\n"

            return format_info + url_list
        else:
            return "❌ 未找到任何视频地址。可能是该网页没有视频,或者视频采用了特殊方式加载。"

    except requests.exceptions.Timeout:
        return "⏱️ 请求超时,请检查URL是否正确或稍后再试"
    except requests.exceptions.RequestException as e:
        return f"❌ 请求失败: {str(e)}"
    except Exception as e:
        return f"❌ 处理出错: {str(e)}"
103
 
104
def extract_with_progress(url, progress=gr.Progress()):
    """带进度显示的提取函数 — run the extractor while reporting UI progress.

    BUGFIX: `gr.Progress()` must be supplied as a *parameter default* —
    Gradio inspects the signature and injects a live tracker bound to the
    event; an instance created inside the body is never wired to the UI,
    so the original progress calls were no-ops. The added parameter is
    backward compatible: existing callers pass only `url`.

    Also removed the two time.sleep(0.5) calls, which added a full second
    of artificial latency to every request without any benefit.

    Args:
        url: Page URL forwarded to extract_video_urls.
        progress: Injected by Gradio; do not pass explicitly.

    Returns:
        The result string from extract_video_urls.
    """
    progress(0, desc="开始处理...")

    progress(0.3, desc="正在请求网页内容...")
    result = extract_video_urls(url)

    progress(0.8, desc="正在整理结果...")
    progress(1.0, desc="完成!")
    return result
118
+
119
# Build the Gradio UI: URL input + button in one row, result box below.
with gr.Blocks(title="视频地址提取工具", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🔍 视频地址提取工具")
    gr.Markdown("输入包含视频的网页URL,提取该页面中所有视频的真实地址。")

    with gr.Row():
        with gr.Column(scale=3):
            url_input = gr.Textbox(
                label="网页URL",
                placeholder="例如: https://example.com/video-page",
                lines=1
            )
        with gr.Column(scale=1):
            extract_btn = gr.Button("提取视频地址", variant="primary", size="lg")

    # Read-only output: results come only from the extraction handler.
    result_output = gr.Textbox(
        label="提取结果",
        lines=10,
        interactive=False
    )

    gr.Markdown("""
    ### 使用说明
    1. 请输入完整的网页URL(必须包含http://或https://)
    2. 部分网站可能因防盗链或加密措施无法提取视频
    3. 提取结果包含视频格式统计和完整视频地址列表
    """)

    # Button click runs the progress-wrapped extractor.
    extract_btn.click(
        fn=extract_with_progress,
        inputs=[url_input],
        outputs=[result_output]
    )

    # Pressing Enter in the URL textbox submits as well.
    url_input.submit(
        fn=extract_with_progress,
        inputs=[url_input],
        outputs=[result_output]
    )
 
 
 
 
 
 
 
160
 
161
# Launch the app only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()
164
+