# Source: Hugging Face Space wzy013/hunyuanvideo-foley (status: Running, file size: 11,621 bytes, revision 21d1989)

import os
import tempfile
import gradio as gr
import requests
import json
from loguru import logger
from typing import Optional, Tuple
import base64
import time

def call_gradio_client_api(video_file, text_prompt, guidance_scale, inference_steps, sample_nums):
    """Call the official HunyuanVideo-Foley Space through ``gradio_client``.

    Args:
        video_file: Video input forwarded unchanged as the first positional
            argument of ``client.predict``.
        text_prompt: Text description of the desired audio.
        guidance_scale: CFG scale forwarded to the remote endpoint.
        inference_steps: Number of inference steps forwarded to the endpoint.
        sample_nums: Number of samples requested from the endpoint.

    Returns:
        A ``(result, status_message)`` tuple. ``result`` is whatever
        ``client.predict`` returned on success and ``None`` on any failure,
        including the case where ``gradio_client`` is not installed.
    """
    # Import lazily so the app can start without the optional dependency, and
    # report its absence explicitly instead of as a generic API error.
    try:
        from gradio_client import Client
    except ImportError:
        logger.warning("gradio_client 未安装，跳过官方API调用")
        return None, "⚠️ gradio_client未安装，跳过官方API调用"

    try:
        logger.info("连接到官方 HunyuanVideo-Foley Space...")

        # Connect to the official Space.
        client = Client("tencent/HunyuanVideo-Foley")

        logger.info("发送推理请求...")

        # NOTE(review): newer gradio_client releases appear to expect file
        # inputs wrapped with handle_file(); confirm against the installed
        # client version and the Space's published API signature.
        result = client.predict(
            video_file,
            text_prompt,
            guidance_scale,
            inference_steps,
            sample_nums,
            api_name="/infer_single_video",
        )

        return result, "✅ 成功通过官方API生成音频!"

    except Exception as e:
        error_msg = str(e)
        logger.error(f"Gradio Client API 调用失败: {error_msg}")

        # Map well-known failure keywords to friendlier messages; the first
        # matching keyword wins, in the same order as the original checks.
        lowered = error_msg.lower()
        known_failures = (
            ("not found", "❌ 官方Space的API端点未找到，可能接口已更改"),
            ("connection", "❌ 无法连接到官方Space，请检查网络"),
            ("queue", "⏳ 官方Space繁忙，请稍后重试"),
        )
        for keyword, message in known_failures:
            if keyword in lowered:
                return None, message
        return None, f"❌ API调用错误: {error_msg}"

def call_huggingface_inference_api(video_file, text_prompt):
    """Request audio generation from the Hugging Face Inference API.

    Reads the video at ``video_file``, base64-encodes it, and POSTs it with
    ``text_prompt`` as a JSON payload. On an HTTP 200 response with a
    non-empty body, the raw bytes are written to ``generated_audio.wav``
    inside a freshly created temporary directory.

    Args:
        video_file: Path of the video file to upload.
        text_prompt: Text description of the desired audio.

    Returns:
        ``([audio_path], status_message)`` on success, otherwise
        ``(None, status_message)``. Exceptions are caught and reported through
        the status message rather than raised.
    """
    try:
        logger.info("尝试Hugging Face Inference API...")

        API_URL = "https://api-inference.huggingface.co/models/tencent/HunyuanVideo-Foley"

        # Read the whole video into memory for base64 encoding.
        with open(video_file, "rb") as f:
            video_data = f.read()

        # Only send credentials when a token is actually configured: an empty
        # "Bearer " value is not a usable credential and should not be sent.
        headers = {}
        token = os.environ.get("HF_TOKEN", "").strip()
        if token:
            headers["Authorization"] = f"Bearer {token}"

        response = requests.post(
            API_URL,
            headers=headers,
            json={"inputs": {"video": base64.b64encode(video_data).decode(), "text": text_prompt}},
            timeout=300,
        )

        if response.status_code != 200:
            # Log a bounded excerpt of the body to help diagnose the failure.
            logger.error(f"HF API错误: {response.status_code} {response.text[:200]}")
            return None, f"❌ Hugging Face API返回错误: {response.status_code}"

        if not response.content:
            # A 200 with no payload would otherwise produce an empty .wav file.
            logger.error("HF API错误: 响应内容为空")
            return None, "❌ Hugging Face API返回了空响应"

        # Persist the result. The directory is intentionally not removed here
        # because the returned path is handed back to the caller.
        temp_dir = tempfile.mkdtemp()
        audio_path = os.path.join(temp_dir, "generated_audio.wav")
        with open(audio_path, "wb") as f:
            f.write(response.content)
        return [audio_path], "✅ 通过Hugging Face API生成成功!"

    except Exception as e:
        logger.error(f"HF API调用失败: {str(e)}")
        return None, f"❌ Hugging Face API调用失败: {str(e)}"

def try_alternative_apis(video_file, text_prompt):
    """Placeholder for additional API back-ends.

    No alternative service is wired up yet, so neither argument is used and
    the function always reports that nothing is available.

    Returns:
        ``(None, status_message)``.
    """
    outcome = (None, "❌ 暂无可用的替代API服务")
    try:
        # Hook point for other public API services (e.g. Replicate, RunPod).
        logger.info("尝试demo接口...")
    except Exception as exc:
        outcome = (None, f"❌ 替代API调用失败: {str(exc)}")
    return outcome

def smart_api_inference(video_file, text_prompt, guidance_scale=4.5, inference_steps=50, sample_nums=1):
    """Try each available API back-end in turn and return the first success.

    Order of attempts: the official Space via ``call_gradio_client_api``, then
    ``call_huggingface_inference_api``, then ``try_alternative_apis`` (whose
    result is only reported, never returned).

    Returns:
        ``(results, status_text)``. ``results`` is the first truthy back-end
        result, or ``[]`` when no video was given or every attempt failed.
        ``status_text`` is the newline-joined log of all attempts; on total
        failure a list of suggested remedies is appended.
    """
    if video_file is None:
        return [], "❌ 请上传视频文件!"

    # Fall back to a generic prompt when none was supplied.
    text_prompt = text_prompt or "audio for this video"

    logger.info(f"开始API推理: {video_file}")
    logger.info(f"文本提示: {text_prompt}")

    attempt_log = ["🔄 尝试连接官方Space API..."]

    def report(*tail):
        # Join everything logged so far plus the given trailing lines.
        return "\n".join([*attempt_log, *tail])

    # Attempt 1: official Space through the Gradio client.
    try:
        audio, note = call_gradio_client_api(
            video_file, text_prompt, guidance_scale, inference_steps, sample_nums
        )
    except ImportError:
        attempt_log.append("⚠️ gradio_client未安装，跳过官方API调用")
    else:
        if audio:
            return audio, report(note)
        attempt_log.append(note)

    # Attempt 2: Hugging Face Inference API.
    attempt_log.append("🔄 尝试Hugging Face Inference API...")
    audio, note = call_huggingface_inference_api(video_file, text_prompt)
    if audio:
        return audio, report(note)
    attempt_log.append(note)

    # Attempt 3: any other API service; only its status is recorded.
    attempt_log.append("🔄 尝试替代API服务...")
    _, note = try_alternative_apis(video_file, text_prompt)
    attempt_log.append(note)

    # Every attempt failed: append suggested remedies to the log.
    return [], report(
        "",
        "💡 **解决方案建议:**",
        "• 安装 gradio_client: pip install gradio_client",
        "• 配置 HF_TOKEN 环境变量",
        "• 等待官方Space负载降低",
        "• 本地运行完整模型(需24GB+ RAM)",
        "",
        "🔗 **官方Space**: https://huggingface.co/spaces/tencent/HunyuanVideo-Foley",
    )

def create_real_api_interface():
    """Build the Gradio Blocks UI for the API-backed audio generator.

    The layout has an input column (video, prompt, three sliders, a generate
    button) and an output column (six audio players plus a status box).
    Clicking the button runs ``smart_api_inference``; moving the sample-count
    slider shows that many audio players.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    # Number of audio players created; matches the sample slider's maximum.
    max_samples = 6

    css = """
    .api-status {
        background: #f0f8ff;
        border: 2px solid #4169e1;
        border-radius: 10px;
        padding: 1rem;
        margin: 1rem 0;
        color: #191970;
    }
    """

    with gr.Blocks(css=css, title="HunyuanVideo-Foley API Client") as app:

        # Header
        gr.HTML("""
        <div style="text-align: center; padding: 2rem; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 20px; margin-bottom: 2rem; color: white;">
            <h1>🎵 HunyuanVideo-Foley</h1>
            <p>API客户端 - 调用真实模型推理</p>
        </div>
        """)

        # API status notice
        gr.HTML("""
        <div class="api-status">
            <strong>🌐 真实API调用模式:</strong> 这个版本会通过API调用真实的HunyuanVideo-Foley模型进行推理。
            <br><strong>优点:</strong> 真实AI音频生成，无需本地大内存
            <br><strong>缺点:</strong> 依赖外部服务可用性，可能需要等待队列
        </div>
        """)

        with gr.Row():
            # Input column
            with gr.Column(scale=1):
                gr.Markdown("### 📹 视频输入")

                # The format hint is rendered as a separate Markdown line
                # rather than an ``info=`` keyword, which gr.Video does not
                # accept in every Gradio release.
                video_input = gr.Video(label="上传视频")
                gr.Markdown("支持MP4、AVI、MOV等格式")

                text_input = gr.Textbox(
                    label="🎯 音频描述",
                    placeholder="描述你想要的音频效果，例如：脚步声、雨声、车辆行驶等",
                    lines=3,
                    value="audio sound effects for this video"
                )

                with gr.Row():
                    guidance_scale = gr.Slider(
                        minimum=1.0,
                        maximum=10.0,
                        value=4.5,
                        step=0.1,
                        label="🎚️ CFG Scale"
                    )

                    inference_steps = gr.Slider(
                        minimum=10,
                        maximum=100,
                        value=50,
                        step=5,
                        label="⚡ 推理步数"
                    )

                    sample_nums = gr.Slider(
                        minimum=1,
                        maximum=max_samples,
                        value=1,
                        step=1,
                        label="🎲 样本数量"
                    )

                generate_btn = gr.Button(
                    "🎵 调用API生成音频",
                    variant="primary",
                    size="lg"
                )

            # Output column
            with gr.Column(scale=1):
                gr.Markdown("### 🎵 生成结果")

                # Only the first player is visible initially.
                audio_outputs = [
                    gr.Audio(label=f"样本 {i+1}", visible=(i == 0))
                    for i in range(max_samples)
                ]

                status_output = gr.Textbox(
                    label="API状态",
                    interactive=False,
                    lines=10,
                    placeholder="等待API调用..."
                )

        def process_with_api(video_file, text_prompt, guidance_scale, inference_steps, sample_nums):
            """Run inference and return one update per audio player plus the status."""
            results, status_msg = smart_api_inference(
                video_file, text_prompt, guidance_scale, inference_steps, int(sample_nums)
            )

            # Only list results are mapped onto players; anything else hides
            # all of them, as does an empty list.
            samples = results[:max_samples] if isinstance(results, list) else []

            # Value and visibility travel together in a single update per
            # component, so the return length matches the wired outputs.
            audio_updates = [
                gr.update(value=samples[i], visible=True)
                if i < len(samples)
                else gr.update(value=None, visible=False)
                for i in range(max_samples)
            ]
            return audio_updates + [status_msg]

        def update_visibility(sample_nums):
            """Show as many audio players as the slider requests."""
            sample_nums = int(sample_nums)
            return [gr.update(visible=(i < sample_nums)) for i in range(max_samples)]

        # Event wiring
        sample_nums.change(
            fn=update_visibility,
            inputs=[sample_nums],
            outputs=audio_outputs
        )

        # ``outputs`` must list components only; visibility changes are
        # delivered through the gr.update values returned by the handler.
        generate_btn.click(
            fn=process_with_api,
            inputs=[video_input, text_input, guidance_scale, inference_steps, sample_nums],
            outputs=audio_outputs + [status_output]
        )

        # Footer
        gr.HTML("""
        <div style="text-align: center; padding: 2rem; color: #666; border-top: 1px solid #eee; margin-top: 2rem;">
            <p><strong>📡 API调用版本</strong> - 通过网络调用真实模型进行推理</p>
            <p>🔗 官方Space: <a href="https://huggingface.co/spaces/tencent/HunyuanVideo-Foley" target="_blank">tencent/HunyuanVideo-Foley</a></p>
            <p>⚠️ 需要安装: <code>pip install gradio_client</code></p>
        </div>
        """)

    return app

if __name__ == "__main__":
    # Reconfigure logging: print messages at INFO level and above.
    logger.remove()
    logger.add(lambda msg: print(msg, end=''), level="INFO")

    logger.info("启动 HunyuanVideo-Foley API 客户端...")

    # Probe the optional dependency up front so its absence is visible at startup.
    try:
        import gradio_client  # noqa: F401 - imported only to test availability
    except ImportError:
        logger.warning("⚠️ gradio_client 未安装，API调用功能可能受限")
    else:
        logger.info("✅ gradio_client 已安装")

    # Build the UI, then serve it.
    app = create_real_api_interface()

    logger.info("API客户端就绪，准备调用真实模型...")

    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": False,
        "show_error": True,
    }
    app.launch(**launch_options)