File size: 11,621 Bytes
21d1989
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
import os
import tempfile
import gradio as gr
import requests
import json
from loguru import logger
from typing import Optional, Tuple
import base64
import time

def call_gradio_client_api(video_file, text_prompt, guidance_scale, inference_steps, sample_nums):
    """Run inference on the official Hugging Face Space through ``gradio_client``.

    The five arguments are forwarded positionally, in order, to the Space's
    ``/infer_single_video`` endpoint.

    Args:
        video_file: Video input handed to the endpoint as-is.
        text_prompt: Text description of the desired audio.
        guidance_scale: CFG scale value.
        inference_steps: Number of inference steps.
        sample_nums: Number of samples to request.

    Returns:
        A ``(result, status)`` tuple. On success ``result`` is whatever
        ``Client.predict`` returned and ``status`` is a success message.
        On any failure ``result`` is ``None`` and ``status`` is a
        user-facing message; this function does not raise for a missing
        ``gradio_client`` package or for API errors.
    """
    # The dependency is optional and imported lazily. A missing package is
    # reported with its own status instead of falling into the generic
    # error branch below, where it would be shown as an opaque API error.
    try:
        from gradio_client import Client
    except ImportError as e:
        logger.error(f"Gradio Client API 调用失败: {e}")
        return None, "⚠️ gradio_client未安装,跳过官方API调用"

    try:
        logger.info("连接到官方 HunyuanVideo-Foley Space...")

        # Connect to the official Space.
        client = Client("tencent/HunyuanVideo-Foley")

        logger.info("发送推理请求...")

        # NOTE(review): some gradio_client releases expect file inputs to be
        # wrapped (e.g. with ``handle_file``) — confirm against the installed
        # version and the Space's published API signature.
        result = client.predict(
            video_file,  # video file
            text_prompt,  # text prompt
            guidance_scale,  # CFG scale
            inference_steps,  # inference steps
            sample_nums,  # number of samples
            api_name="/infer_single_video"  # API endpoint name
        )

        return result, "✅ 成功通过官方API生成音频!"

    except Exception as e:
        error_msg = str(e)
        logger.error(f"Gradio Client API 调用失败: {error_msg}")

        # Classify the failure by substring so the UI can show a targeted hint.
        lowered = error_msg.lower()
        if "not found" in lowered:
            return None, "❌ 官方Space的API端点未找到,可能接口已更改"
        elif "connection" in lowered:
            return None, "❌ 无法连接到官方Space,请检查网络"
        elif "queue" in lowered:
            return None, "⏳ 官方Space繁忙,请稍后重试"
        else:
            return None, f"❌ API调用错误: {error_msg}"

def call_huggingface_inference_api(video_file, text_prompt):
    """Request audio generation from the Hugging Face Inference API.

    The video file is read fully into memory, base64-encoded and POSTed as
    JSON together with the text prompt. On HTTP 200 the raw response body is
    written to ``generated_audio.wav`` inside a freshly created temporary
    directory; this function does not remove that directory.

    Args:
        video_file: Path of the video file to upload.
        text_prompt: Text description of the desired audio.

    Returns:
        ``([audio_path], status)`` on HTTP 200, otherwise ``(None, status)``
        where ``status`` is a user-facing error message. Exceptions
        (unreadable file, network failure, ...) are caught and reported
        through ``status`` rather than raised.
    """
    try:
        logger.info("尝试Hugging Face Inference API...")

        API_URL = "https://api-inference.huggingface.co/models/tencent/HunyuanVideo-Foley"

        # Read the video file.
        with open(video_file, "rb") as f:
            video_data = f.read()

        # Attach credentials only when a token is configured: an empty
        # "Bearer " value is not a valid credential, so without a token the
        # request is sent anonymously instead.
        headers = {}
        hf_token = os.environ.get("HF_TOKEN", "").strip()
        if hf_token:
            headers["Authorization"] = f"Bearer {hf_token}"

        # Send the request.
        response = requests.post(
            API_URL,
            headers=headers,
            json={"inputs": {"video": base64.b64encode(video_data).decode(), "text": text_prompt}},
            timeout=300
        )

        if response.status_code == 200:
            # Save the result.
            # NOTE(review): the body is assumed to be audio bytes; its
            # content type is not validated here — confirm against the API.
            temp_dir = tempfile.mkdtemp()
            audio_path = os.path.join(temp_dir, "generated_audio.wav")
            with open(audio_path, 'wb') as f:
                f.write(response.content)
            return [audio_path], "✅ 通过Hugging Face API生成成功!"
        else:
            logger.error(f"HF API错误: {response.status_code}")
            return None, f"❌ Hugging Face API返回错误: {response.status_code}"

    except Exception as e:
        # Boundary handler: every failure becomes a status line for the UI.
        logger.error(f"HF API调用失败: {str(e)}")
        return None, f"❌ Hugging Face API调用失败: {str(e)}"

def try_alternative_apis(video_file, text_prompt):
    """Placeholder for additional hosted back ends; none is wired up yet.

    Both arguments are currently unused.

    Returns:
        ``(None, status)``, where ``status`` says that no alternative
        service is available, or describes the failure if the probe raised.
    """
    try:
        # 1. Try a public demo endpoint.
        logger.info("尝试demo接口...")
        # Other public API services (e.g. Replicate, RunPod) could be
        # attempted here.
    except Exception as e:
        return None, f"❌ 替代API调用失败: {str(e)}"
    else:
        unavailable_msg = "❌ 暂无可用的替代API服务"
        return None, unavailable_msg

def smart_api_inference(video_file, text_prompt, guidance_scale=4.5, inference_steps=50, sample_nums=1):
    """Try each remote back end in turn and return the first usable result.

    Order of attempts: the official Space via ``call_gradio_client_api``,
    then ``call_huggingface_inference_api``, then ``try_alternative_apis``.
    Every attempt contributes lines to a running status log that is returned
    as one newline-joined string.

    Args:
        video_file: Video input; ``None`` short-circuits with an upload prompt.
        text_prompt: Audio description; a falsy value is replaced by a
            generic default prompt.
        guidance_scale: CFG scale, forwarded to the official Space only.
        inference_steps: Inference steps, forwarded to the official Space only.
        sample_nums: Sample count, forwarded to the official Space only.

    Returns:
        ``(result, status_text)``. ``result`` is the first truthy value a
        back end produced, or ``[]`` when there is no input or every back
        end failed (in which case troubleshooting hints are appended).
    """
    if video_file is None:
        return [], "❌ 请上传视频文件!"

    text_prompt = text_prompt or "audio for this video"

    logger.info(f"开始API推理: {video_file}")
    logger.info(f"文本提示: {text_prompt}")

    # Attempt 1: Gradio client against the official Space (most likely to work).
    log_lines = ["🔄 尝试连接官方Space API..."]
    try:
        outcome, note = call_gradio_client_api(
            video_file, text_prompt, guidance_scale, inference_steps, sample_nums
        )
    except ImportError:
        log_lines.append("⚠️ gradio_client未安装,跳过官方API调用")
    else:
        log_lines.append(note)
        if outcome:
            return outcome, "\n".join(log_lines)

    # Attempt 2: Hugging Face Inference API.
    log_lines.append("🔄 尝试Hugging Face Inference API...")
    outcome, note = call_huggingface_inference_api(video_file, text_prompt)
    log_lines.append(note)
    if outcome:
        return outcome, "\n".join(log_lines)

    # Attempt 3: any other API service (its result value is not used).
    log_lines.append("🔄 尝试替代API服务...")
    _, note = try_alternative_apis(video_file, text_prompt)
    log_lines.append(note)

    # Every attempt failed: append troubleshooting hints.
    log_lines.extend([
        "",
        "💡 **解决方案建议:**",
        "• 安装 gradio_client: pip install gradio_client",
        "• 配置 HF_TOKEN 环境变量",
        "• 等待官方Space负载降低",
        "• 本地运行完整模型(需24GB+ RAM)",
        "",
        "🔗 **官方Space**: https://huggingface.co/spaces/tencent/HunyuanVideo-Foley",
    ])

    return [], "\n".join(log_lines)

def create_real_api_interface():
    """Build the Gradio Blocks UI for the remote-API client.

    Layout: a header and notice banner, an input column (video upload,
    prompt, three sliders, generate button) and an output column with six
    pre-built audio slots plus a status textbox. Only the first audio slot
    is visible initially.

    Returns:
        The assembled ``gr.Blocks`` application; it is not launched here.
    """
    # Number of pre-built audio slots; also the upper bound of the
    # sample-count slider.
    max_samples = 6

    css = """
    .api-status {
        background: #f0f8ff;
        border: 2px solid #4169e1;
        border-radius: 10px;
        padding: 1rem;
        margin: 1rem 0;
        color: #191970;
    }
    """

    with gr.Blocks(css=css, title="HunyuanVideo-Foley API Client") as app:

        # Header
        gr.HTML("""
        <div style="text-align: center; padding: 2rem; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 20px; margin-bottom: 2rem; color: white;">
            <h1>🎵 HunyuanVideo-Foley</h1>
            <p>API客户端 - 调用真实模型推理</p>
        </div>
        """)

        # API Status Notice
        gr.HTML("""
        <div class="api-status">
            <strong>🌐 真实API调用模式:</strong> 这个版本会通过API调用真实的HunyuanVideo-Foley模型进行推理。
            <br><strong>优点:</strong> 真实AI音频生成,无需本地大内存
            <br><strong>缺点:</strong> 依赖外部服务可用性,可能需要等待队列
        </div>
        """)

        with gr.Row():
            # Input area
            with gr.Column(scale=1):
                gr.Markdown("### 📹 视频输入")

                video_input = gr.Video(label="上传视频")
                # The format hint is rendered as its own caption because
                # ``info`` is not among gr.Video's documented parameters.
                gr.Markdown("支持MP4、AVI、MOV等格式")

                text_input = gr.Textbox(
                    label="🎯 音频描述",
                    placeholder="描述你想要的音频效果,例如:脚步声、雨声、车辆行驶等",
                    lines=3,
                    value="audio sound effects for this video"
                )

                with gr.Row():
                    guidance_scale = gr.Slider(
                        minimum=1.0,
                        maximum=10.0,
                        value=4.5,
                        step=0.1,
                        label="🎚️ CFG Scale"
                    )

                    inference_steps = gr.Slider(
                        minimum=10,
                        maximum=100,
                        value=50,
                        step=5,
                        label="⚡ 推理步数"
                    )

                    sample_nums = gr.Slider(
                        minimum=1,
                        maximum=max_samples,
                        value=1,
                        step=1,
                        label="🎲 样本数量"
                    )

                generate_btn = gr.Button(
                    "🎵 调用API生成音频",
                    variant="primary",
                    size="lg"
                )

            # Output area
            with gr.Column(scale=1):
                gr.Markdown("### 🎵 生成结果")

                # Only the first slot starts visible.
                audio_outputs = [
                    gr.Audio(label=f"样本 {i+1}", visible=(i == 0))
                    for i in range(max_samples)
                ]

                status_output = gr.Textbox(
                    label="API状态",
                    interactive=False,
                    lines=10,
                    placeholder="等待API调用..."
                )

        # Event handling
        def process_with_api(video_file, text_prompt, cfg_scale, steps, num_samples):
            """Run inference and return one update per audio slot plus the status text."""
            num_samples = int(num_samples)
            results, status_msg = smart_api_inference(
                video_file, text_prompt, cfg_scale, steps, num_samples
            )

            # NOTE(review): only list results are displayed; if a back end
            # returns another container type it is ignored here — confirm
            # the shape returned by the Space endpoint.
            clips = results[:max_samples] if results and isinstance(results, list) else []

            if clips:
                # Show exactly the slots that received a clip.
                audio_updates = [
                    gr.update(value=clips[i], visible=True)
                    if i < len(clips)
                    else gr.update(value=None, visible=False)
                    for i in range(max_samples)
                ]
            else:
                # Nothing generated: clear the players but keep the layout
                # the sample-count slider currently selects.
                audio_updates = [
                    gr.update(value=None, visible=(i < num_samples))
                    for i in range(max_samples)
                ]

            # One return value per output component: six audio slots, then status.
            return audio_updates + [status_msg]

        # Show as many audio slots as the slider selects.
        def update_visibility(num_samples):
            num_samples = int(num_samples)
            return [gr.update(visible=(i < num_samples)) for i in range(max_samples)]

        # Wire up events
        sample_nums.change(
            fn=update_visibility,
            inputs=[sample_nums],
            outputs=audio_outputs
        )

        # ``outputs`` must list components only; value and visibility both
        # travel in the gr.update objects the handler returns.
        generate_btn.click(
            fn=process_with_api,
            inputs=[video_input, text_input, guidance_scale, inference_steps, sample_nums],
            outputs=audio_outputs + [status_output]
        )

        # Footer
        gr.HTML("""
        <div style="text-align: center; padding: 2rem; color: #666; border-top: 1px solid #eee; margin-top: 2rem;">
            <p><strong>📡 API调用版本</strong> - 通过网络调用真实模型进行推理</p>
            <p>🔗 官方Space: <a href="https://huggingface.co/spaces/tencent/HunyuanVideo-Foley" target="_blank">tencent/HunyuanVideo-Foley</a></p>
            <p>⚠️ 需要安装: <code>pip install gradio_client</code></p>
        </div>
        """)

    return app

if __name__ == "__main__":
    # Logging setup: drop the existing sinks and print INFO+ messages to stdout.
    logger.remove()
    logger.add(lambda record: print(record, end=""), level="INFO")

    logger.info("启动 HunyuanVideo-Foley API 客户端...")

    # Dependency probe: report whether the optional gradio_client package
    # can be imported; startup continues either way.
    try:
        import gradio_client
    except ImportError:
        logger.warning("⚠️ gradio_client 未安装,API调用功能可能受限")
    else:
        logger.info("✅ gradio_client 已安装")

    # Build the UI, then start serving it.
    app = create_real_api_interface()

    logger.info("API客户端就绪,准备调用真实模型...")

    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": False,
        "show_error": True,
    }
    app.launch(**launch_options)