import gradio as gr
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
import time

# Supported video file extensions (lower-case, each including the leading dot).
SUPPORTED_FORMATS = {'.mp4', '.webm', '.mov', '.avi', '.mkv', '.flv', '.wmv', '.mpeg', '.mpg'}

def is_valid_url(url):
    """Return True if *url* parses with both a scheme and a network location.

    Args:
        url: The candidate URL, normally a string.

    Returns:
        bool: True when ``urlparse`` yields a non-empty scheme and netloc;
        False otherwise, including when the value cannot be parsed at all.
    """
    try:
        parsed = urlparse(url)
    except (ValueError, AttributeError, TypeError):
        # ValueError: malformed URL (e.g. an unbalanced IPv6 bracket).
        # AttributeError/TypeError: the input is not str/bytes-like.
        # Deliberately not a bare ``except`` so that KeyboardInterrupt and
        # SystemExit are never swallowed here.
        return False
    return bool(parsed.scheme and parsed.netloc)

# Absolute http(s) URLs as they appear in raw HTML/JS. Besides whitespace and
# quotes, the match also stops at '<', '>' and '\' so that trailing markup
# such as "</a>" is not swallowed into the URL.
_ABSOLUTE_URL_RE = re.compile(r'https?://[^\s"\'<>\\]+')
# Absolute URLs that contain a "/video/" path segment followed by more text.
_VIDEO_API_URL_RE = re.compile(r'https?://[^\s"\'<>\\]+/video/[^\s"\'<>\\]+')


def _video_extension(candidate):
    """Return the supported extension that *candidate* ends with, or None.

    The URL path is checked first so that query strings and fragments
    (``clip.mp4?token=abc``) do not hide the extension; the full URL is then
    checked as a fallback so anything that ends with an extension still counts.
    """
    lowered = candidate.lower()
    try:
        path = urlparse(candidate).path.lower()
    except ValueError:
        # Unparsable URL: fall back to the plain string check only.
        path = ''
    for text in (path, lowered):
        for fmt in SUPPORTED_FORMATS:
            if text.endswith(fmt):
                return fmt
    return None


def _resolve_src(base_url, src):
    """Resolve a tag's ``src`` against *base_url*; None if empty or malformed."""
    if not isinstance(src, str):
        return None
    src = src.strip()
    if not src:
        # urljoin(base, '') returns the page URL itself, which is not a video.
        return None
    try:
        return urljoin(base_url, src)
    except ValueError:
        # One malformed attribute must not abort the whole extraction.
        return None


def _collect_video_urls(html_content, base_url):
    """Return the set of candidate video URLs found in *html_content*."""
    soup = BeautifulSoup(html_content, 'html.parser')
    found = set()

    # 1. <video src=...> and the <source src=...> tags nested inside it.
    for video_tag in soup.find_all('video'):
        for tag in [video_tag, *video_tag.find_all('source')]:
            resolved = _resolve_src(base_url, tag.get('src'))
            if resolved is not None:
                found.add(resolved)

    # 2. <iframe> whose URL contains a known extension or the word "video".
    for iframe_tag in soup.find_all('iframe'):
        resolved = _resolve_src(base_url, iframe_tag.get('src'))
        if resolved is None:
            continue
        lowered = resolved.lower()
        if 'video' in lowered or any(fmt in lowered for fmt in SUPPORTED_FORMATS):
            found.add(resolved)

    # 3. Any absolute URL in the raw markup that points at a video file.
    found.update(
        match
        for match in _ABSOLUTE_URL_RE.findall(html_content)
        if _video_extension(match) is not None
    )

    # 4. Any absolute URL in the raw markup with a "/video/" path segment.
    found.update(_VIDEO_API_URL_RE.findall(html_content))

    return found


def _format_results(video_urls):
    """Render the per-format summary line followed by the numbered URL list."""
    format_counts = {}
    for video_url in video_urls:
        fmt = _video_extension(video_url)
        if fmt is not None:
            format_counts[fmt] = format_counts.get(fmt, 0) + 1

    # Sorted so the summary is stable between runs (set order is not).
    format_info = "📊 检测到的视频格式: " + ", ".join(
        f"{fmt} ({count}个)" for fmt, count in sorted(format_counts.items())
    )
    url_lines = [
        f"{index}. {video_url}\n"
        for index, video_url in enumerate(sorted(video_urls), 1)
    ]
    return format_info + "\n\n📋 提取到的视频地址:\n" + "".join(url_lines)


def extract_video_urls(url):
    """Fetch *url* and return a text report of the video URLs found in the page.

    The page is downloaded with a browser-like User-Agent and scanned for
    ``<video>``/``<source>`` tags, video-looking ``<iframe>`` sources, absolute
    URLs ending in a supported extension, and absolute URLs containing
    ``/video/``.

    Args:
        url: Address of the web page to scan.

    Returns:
        str: A format summary plus a numbered, sorted URL list on success, or
        a human-readable error message. Failures are reported through the
        returned message rather than raised.
    """
    if not is_valid_url(url):
        return "❌ 无效的URL，请确保包含http://或https://"

    # Browser-like header sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Resolve relative links against the final URL after any redirects,
        # not against the URL that was originally requested.
        video_urls = _collect_video_urls(response.text, response.url)

        if not video_urls:
            return "❌ 未找到任何视频地址。可能是该网页没有视频，或者视频采用了特殊方式加载。"
        return _format_results(video_urls)

    except requests.exceptions.Timeout:
        return "⏱️ 请求超时，请检查URL是否正确或稍后再试。"
    except requests.exceptions.RequestException as e:
        return f"❌ 请求失败: {str(e)}"
    except Exception as e:
        # UI boundary: surface unexpected errors as a message instead of crashing.
        return f"❌ 处理出错: {str(e)}"

def extract_with_progress(url, progress=gr.Progress()):
    """Run ``extract_video_urls`` while reporting progress to the Gradio UI.

    Args:
        url: Address of the web page to scan.
        progress: Gradio progress tracker. It must be declared as a default
            parameter value (not created inside the body) for Gradio to track
            it and show the progress bar; callers normally leave it unset.

    Returns:
        str: The report or error message produced by ``extract_video_urls``.
    """
    progress(0, desc="开始处理...")
    # Short pause kept from the original; presumably so the first step is
    # visible before the request starts.
    time.sleep(0.5)

    progress(0.3, desc="正在请求网页内容...")
    result = extract_video_urls(url)

    progress(0.8, desc="正在整理结果...")
    time.sleep(0.5)

    progress(1.0, desc="完成!")
    return result

# Build the Gradio interface: a URL textbox and a button on one row, a
# read-only result textbox below, then usage notes. Components are laid out in
# the order they are created inside the nested context managers.
with gr.Blocks(title="视频地址提取工具", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🔍 视频地址提取工具")
    gr.Markdown("输入包含视频的网页URL，提取该页面中所有视频的真实地址。")
    
    with gr.Row():
        with gr.Column(scale=3):
            url_input = gr.Textbox(
                label="网页URL", 
                placeholder="例如: https://example.com/video-page",
                lines=1
            )
        with gr.Column(scale=1):
            extract_btn = gr.Button("提取视频地址", variant="primary", size="lg")
    
    result_output = gr.Textbox(
        label="提取结果", 
        lines=10,
        interactive=False
    )
    
    gr.Markdown("""
    ### 使用说明
    1. 请输入完整的网页URL（必须包含http://或https://）
    2. 部分网站可能因防盗链或加密措施无法提取视频
    3. 提取结果包含视频格式统计和完整视频地址列表
    """)
    
    # Event wiring: clicking the button runs the extraction on the URL textbox
    # value and writes the returned text to the result textbox.
    extract_btn.click(
        fn=extract_with_progress,
        inputs=[url_input],
        outputs=[result_output]
    )
    
    # Also run the same handler when the URL textbox is submitted (Enter key).
    url_input.submit(
        fn=extract_with_progress,
        inputs=[url_input],
        outputs=[result_output]
    )

# Launch the application only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()