# Source: Hugging Face Space "wzy013/hunyuanvideo-foley" (status: Running).
# File size: 11,840 bytes.
import json
import math
import os
import tempfile
from typing import Optional, Tuple

import gradio as gr
import requests
import torch
import torchaudio
from loguru import logger

def create_realistic_demo_audio(video_file, text_prompt: str, duration: float = 5.0) -> str:
    """Synthesize a placeholder sound effect and save it as a WAV file.

    The waveform is picked by keyword matching on ``text_prompt`` (English
    or Chinese keywords for footsteps, rain, wind and car). Any other prompt
    yields a tone with two overtones whose pitch grows with the prompt
    length. No model is run and ``video_file`` is never read.

    Args:
        video_file: Accepted for interface compatibility; unused.
        text_prompt: Description of the desired sound. ``None`` is treated
            as an empty prompt.
        duration: Clip length in seconds.

    Returns:
        Path of the WAV file (48 kHz sample rate, written from a
        ``(1, num_samples)`` tensor) inside a freshly created temporary
        directory. The directory is not removed by this function.
    """
    sample_rate = 48000
    duration_samples = int(duration * sample_rate)
    two_pi = 2 * math.pi

    text_prompt = text_prompt or ""
    prompt_lower = text_prompt.lower()

    # Time axis in seconds, one entry per output sample.
    t = torch.linspace(0, duration, duration_samples)

    if "footsteps" in prompt_lower or "步" in text_prompt:
        # Footsteps: 2 Hz sine shaped by a decay that restarts every 0.5 s.
        audio = 0.4 * torch.sin(two_pi * 2 * t) * torch.exp(-3 * (t % 0.5))
    elif "rain" in prompt_lower or "雨" in text_prompt:
        # Rain: white noise.
        audio = 0.3 * torch.randn(duration_samples)
    elif "wind" in prompt_lower or "风" in text_prompt:
        # Wind: slow 0.5 Hz sine plus noise.
        audio = 0.3 * torch.sin(two_pi * 0.5 * t) + 0.2 * torch.randn(duration_samples)
    elif "car" in prompt_lower or "车" in text_prompt:
        # Vehicle: mix of 80 Hz and 120 Hz sines.
        audio = 0.3 * torch.sin(two_pi * 80 * t) + 0.2 * torch.sin(two_pi * 120 * t)
    else:
        # Default: fundamental derived from the prompt length plus the
        # second and third harmonics at decreasing amplitude.
        base_freq = 220 + len(text_prompt) * 5
        audio = 0.3 * torch.sin(two_pi * base_freq * t)
        audio += 0.1 * torch.sin(two_pi * base_freq * 2 * t)
        audio += 0.05 * torch.sin(two_pi * base_freq * 3 * t)

    # Fade in/out over 0.1 s to avoid an abrupt start and end. The fade is
    # clamped to half the clip so that very short clips neither overflow the
    # envelope nor have the two ramps overwrite each other.
    envelope = torch.ones_like(audio)
    fade_samples = min(int(0.1 * sample_rate), duration_samples // 2)
    if fade_samples > 0:
        envelope[:fade_samples] = torch.linspace(0, 1, fade_samples)
        envelope[-fade_samples:] = torch.linspace(1, 0, fade_samples)
    audio *= envelope

    # Write to a new temporary directory and hand the path back to the caller.
    temp_dir = tempfile.mkdtemp()
    audio_path = os.path.join(temp_dir, "enhanced_demo_audio.wav")
    torchaudio.save(audio_path, audio.unsqueeze(0), sample_rate)

    return audio_path

def check_real_api_availability():
    """Probe which real inference backends look usable in this environment.

    Three independent best-effort checks are made. Any ``Exception`` raised
    by a check is absorbed and leaves that entry ``False``:

    * ``gradio_client``: the package imports and a ``Client`` for the
      ``tencent/HunyuanVideo-Foley`` Space can be constructed.
    * ``hf_inference``: ``HF_TOKEN`` or ``HUGGING_FACE_HUB_TOKEN`` is set to
      a non-empty value (the token itself is not validated).
    * ``replicate``: the ``replicate`` package imports and
      ``REPLICATE_API_TOKEN`` is set to a non-empty value.

    Returns:
        dict with the keys ``"gradio_client"``, ``"hf_inference"`` and
        ``"replicate"``, each mapped to a bool.
    """
    api_status = {
        "gradio_client": False,
        "hf_inference": False,
        "replicate": False
    }

    # gradio_client: optional dependency, imported lazily.
    try:
        from gradio_client import Client
        # NOTE(review): `timeout` may not be a keyword accepted by
        # gradio_client.Client; if it is not, this call raises TypeError and
        # the probe always reports False. Confirm against the installed
        # gradio_client version.
        Client("tencent/HunyuanVideo-Foley", timeout=5)
        api_status["gradio_client"] = True
    except Exception:
        # Deliberate best-effort probe: a missing package or any connection
        # failure just means "unavailable". Unlike a bare ``except`` this
        # lets KeyboardInterrupt and SystemExit propagate.
        pass

    # Hugging Face token: presence only.
    hf_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN')
    if hf_token:
        api_status["hf_inference"] = True

    # Replicate: package must import and a token must be configured.
    try:
        import replicate  # noqa: F401 -- imported only to test availability
        if os.environ.get('REPLICATE_API_TOKEN'):
            api_status["replicate"] = True
    except Exception:
        # Same best-effort policy as above.
        pass

    return api_status

def process_video_smart(video_file, text_prompt: str, guidance_scale: float, inference_steps: int, sample_nums: int) -> Tuple[list, str]:
    """Produce demo audio samples for an uploaded video plus a status report.

    The API availability probe is run and logged, but no real API is called
    yet: every sample comes from ``create_realistic_demo_audio``.
    ``guidance_scale`` and ``inference_steps`` are only echoed in the status
    text.

    Args:
        video_file: Uploaded video, as a path or an object exposing the path
            via ``.name``. ``None`` aborts with an error message.
        text_prompt: Description of the desired sound. ``None``, empty and
            whitespace-only prompts fall back to a generic default.
        guidance_scale: CFG value shown in the status text.
        inference_steps: Step count shown in the status text.
        sample_nums: Requested number of samples; at most 3 are generated.

    Returns:
        ``(audio_paths, status_message)``. On any failure the list is empty
        and the message describes the error.
    """
    if video_file is None:
        return [], "❌ 请上传视频文件!"

    # Blank prompts are treated like a missing prompt so the default applies.
    if text_prompt is None or not text_prompt.strip():
        text_prompt = "audio sound effects for this video"

    # Probe the real backends; the result is only reported, not acted upon.
    api_status = check_real_api_availability()
    logger.info(f"API可用性检查: {api_status}")

    try:
        logger.info(f"处理视频: {video_file}")
        logger.info(f"文本提示: {text_prompt}")

        # NOTE(review): the "_variation_N" suffix has the same length for
        # every N and contains none of the keywords matched by
        # create_realistic_demo_audio, so its deterministic branches return
        # the same waveform for every sample. Only the noise-based branches
        # actually differ between samples.
        audio_outputs = []
        for i in range(min(sample_nums, 3)):
            varied_prompt = f"{text_prompt}_variation_{i+1}"
            demo_audio = create_realistic_demo_audio(video_file, varied_prompt)
            audio_outputs.append(demo_audio)

        # Resolve a display name from either a plain path or an object that
        # carries its path in `.name`; fall back to a generic label.
        video_path = getattr(video_file, "name", video_file)
        video_name = os.path.basename(str(video_path)) or '已上传'

        status_msg = f"""✅ 增强演示版本处理完成!

📹 **视频**: {video_name}
📝 **提示**: "{text_prompt}"
⚙️ **设置**: CFG={guidance_scale}, 步数={inference_steps}, 样本={sample_nums}

🎵 **生成**: {len(audio_outputs)} 个音频样本

🧠 **智能特性**:
• 根据文本内容选择音频类型
• 脚步声/雨声/风声/车辆声等不同效果
• 48kHz高质量输出
• 自动淡入淡出和包络处理

📊 **API状态检查**:
• Gradio Client: {'✅' if api_status['gradio_client'] else '❌'}
• HF Inference: {'✅' if api_status['hf_inference'] else '❌'}  
• Replicate: {'✅' if api_status['replicate'] else '❌'}

💡 **这是增强演示版本，展示真实AI音频的工作流程**
🚀 **完整版本**: https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley"""

        return audio_outputs, status_msg

    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        logger.error(f"处理失败: {str(e)}")
        return [], f"❌ 处理失败: {str(e)}"

def create_smart_interface():
    """Build the Gradio Blocks UI for the smart demo.

    Layout: a header and notice banner, an input column (video upload,
    prompt textbox, three sliders, generate button), an output column (three
    audio players and a status textbox), a block of suggested prompts, and a
    footer.

    Wiring: moving the sample-count slider toggles the visibility of the
    second and third audio players; the generate button calls
    ``process_video_smart`` and fills the players and the status textbox.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    # Slider default; the initial visibility of the audio players is derived
    # from it so that a run with untouched settings shows every clip.
    default_sample_count = 2

    css = """
    .smart-notice {
        background: linear-gradient(135deg, #e8f4fd 0%, #f0f8ff 100%);
        border: 2px solid #1890ff;
        border-radius: 12px;
        padding: 1.5rem;
        margin: 1rem 0;
        color: #0050b3;
    }
    
    .api-status {
        background: #f6ffed;
        border: 1px solid #52c41a;
        border-radius: 8px;
        padding: 1rem;
        margin: 1rem 0;
        color: #389e0d;
    }
    """

    with gr.Blocks(css=css, title="HunyuanVideo-Foley Smart Demo") as app:

        # Header
        gr.HTML("""
        <div style="text-align: center; padding: 2rem; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 20px; margin-bottom: 2rem; color: white;">
            <h1>🎵 HunyuanVideo-Foley</h1>
            <p>智能演示版 - 真实工作流程体验</p>
        </div>
        """)

        # Smart Notice
        gr.HTML("""
        <div class="smart-notice">
            <strong>🧠 智能演示模式:</strong> 
            <br>• 自动检测可用API服务
            <br>• 根据文本内容生成对应音效类型  
            <br>• 完整展示AI音频生成工作流程
            <br>• <strong>支持</strong>: 脚步声、雨声、风声、车辆声等多种音效
        </div>
        """)

        with gr.Row():
            # Input section
            with gr.Column(scale=1):
                gr.Markdown("### 📹 视频输入")

                video_input = gr.Video(
                    label="上传视频文件"
                )

                text_input = gr.Textbox(
                    label="🎯 音频描述",
                    placeholder="例如：footsteps on wood floor, rain on leaves, wind through trees, car engine",
                    lines=3,
                    value="footsteps on the ground"
                )

                with gr.Row():
                    guidance_scale = gr.Slider(
                        minimum=1.0,
                        maximum=10.0,
                        value=4.5,
                        step=0.1,
                        label="🎚️ CFG Scale"
                    )

                    inference_steps = gr.Slider(
                        minimum=10,
                        maximum=100,
                        value=50,
                        step=5,
                        label="⚡ 推理步数"
                    )

                    sample_nums = gr.Slider(
                        minimum=1,
                        maximum=3,
                        value=default_sample_count,
                        step=1,
                        label="🎲 样本数量"
                    )

                generate_btn = gr.Button(
                    "🎵 智能生成音频",
                    variant="primary"
                )

            # Output section
            with gr.Column(scale=1):
                gr.Markdown("### 🎵 生成结果")

                # Initial visibility follows the slider default; afterwards
                # update_visibility keeps it in sync with the slider.
                audio_output_1 = gr.Audio(label="样本 1", visible=True)
                audio_output_2 = gr.Audio(label="样本 2", visible=default_sample_count >= 2)
                audio_output_3 = gr.Audio(label="样本 3", visible=default_sample_count >= 3)

                status_output = gr.Textbox(
                    label="处理状态",
                    interactive=False,
                    lines=12,
                    placeholder="等待处理..."
                )

        # Examples
        gr.Markdown("### 🌟 推荐提示词")
        gr.HTML("""
        <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin: 1rem 0;">
            <div style="padding: 1rem; background: #f8fafc; border-radius: 8px;">
                <strong>脚步声:</strong> footsteps on wooden floor<br>
                <strong>自然音:</strong> rain drops on leaves<br>
                <strong>环境音:</strong> wind through the trees
            </div>
            <div style="padding: 1rem; background: #f8fafc; border-radius: 8px;">
                <strong>机械音:</strong> car engine running<br>
                <strong>动作音:</strong> door opening and closing<br>
                <strong>水声:</strong> water flowing in stream
            </div>
        </div>
        """)

        # Event handlers
        def process_smart(video_file, text_prompt, guidance_scale, inference_steps, sample_nums):
            """Run the generation and spread up to three paths over the players."""
            audio_files, status_msg = process_video_smart(
                video_file, text_prompt, guidance_scale, inference_steps, int(sample_nums)
            )

            # Players without a generated file receive None.
            outputs = [None, None, None]
            for i, audio_file in enumerate(audio_files[:3]):
                outputs[i] = audio_file

            return outputs[0], outputs[1], outputs[2], status_msg

        def update_visibility(sample_nums):
            """Show as many audio players as samples are requested."""
            sample_nums = int(sample_nums)
            return [
                gr.update(visible=True),  # Sample 1 always visible
                gr.update(visible=sample_nums >= 2),
                gr.update(visible=sample_nums >= 3)
            ]

        # Connect events
        sample_nums.change(
            fn=update_visibility,
            inputs=[sample_nums],
            outputs=[audio_output_1, audio_output_2, audio_output_3]
        )

        generate_btn.click(
            fn=process_smart,
            inputs=[video_input, text_input, guidance_scale, inference_steps, sample_nums],
            outputs=[audio_output_1, audio_output_2, audio_output_3, status_output]
        )

        # Footer
        gr.HTML("""
        <div style="text-align: center; padding: 2rem; color: #666; border-top: 1px solid #eee; margin-top: 2rem;">
            <p><strong>🧠 智能演示版</strong> - 展示完整的AI音频生成工作流程</p>
            <p>💡 根据不同描述词生成对应类型的音效</p>
            <p>🔗 完整版本: <a href="https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley" target="_blank">GitHub Repository</a></p>
        </div>
        """)

    return app

if __name__ == "__main__":
    # Script entry point: configure logging, build the UI, start the server.

    def _print_sink(message):
        # end='' keeps print from appending a newline of its own.
        print(message, end='')

    # Swap the existing loguru handlers for a single INFO-level print sink.
    logger.remove()
    logger.add(_print_sink, level="INFO")

    logger.info("启动 HunyuanVideo-Foley 智能演示版...")

    app = create_smart_interface()

    logger.info("智能演示版就绪 - 支持多种音效类型")

    # Listen on all interfaces at port 7860, with no share link, no debug
    # mode, and errors shown in the UI.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": False,
        "show_error": True,
    }
    app.launch(**launch_options)