# Spaces: Running | File size: 11,840 Bytes | 7315716
import json
import math
import os
import tempfile
from typing import Optional, Tuple

import gradio as gr
import requests
import torch
import torchaudio
from loguru import logger
def create_realistic_demo_audio(video_file, text_prompt: str, duration: float = 5.0) -> str:
    """Synthesize a placeholder sound effect for a prompt and save it as a WAV file.

    The waveform is chosen by substring matching on ``text_prompt`` (an English
    keyword matched case-insensitively, or its Chinese counterpart): footsteps,
    rain, wind, car; anything else yields a harmonic tone whose pitch depends
    on the prompt length.

    Args:
        video_file: Accepted for interface compatibility; not read here.
        text_prompt: Description of the desired sound. ``None`` is treated as
            an empty prompt.
        duration: Clip length in seconds; must yield at least one sample.

    Returns:
        Path of a single-channel 48 kHz WAV file inside a newly created
        temporary directory. The directory is not removed by this function.

    Raises:
        ValueError: If ``duration`` is too small to produce any sample.
    """
    sample_rate = 48000
    duration_samples = int(duration * sample_rate)
    if duration_samples <= 0:
        raise ValueError(f"duration must be positive, got {duration!r}")

    prompt = text_prompt or ""
    prompt_lower = prompt.lower()
    two_pi = 2 * math.pi

    # Time axis in seconds, one entry per output sample.
    t = torch.linspace(0, duration, duration_samples)

    if "footsteps" in prompt_lower or "步" in prompt:
        # Footsteps: 2 Hz sine, re-triggered decay every 0.5 s.
        audio = 0.4 * torch.sin(two_pi * 2 * t) * torch.exp(-3 * (t % 0.5))
    elif "rain" in prompt_lower or "雨" in prompt:
        # Rain: white noise.
        audio = 0.3 * torch.randn(duration_samples)
    elif "wind" in prompt_lower or "风" in prompt:
        # Wind: slow 0.5 Hz swell plus noise.
        audio = 0.3 * torch.sin(two_pi * 0.5 * t) + 0.2 * torch.randn(duration_samples)
    elif "car" in prompt_lower or "车" in prompt:
        # Vehicle: two mixed tones at 80 Hz and 120 Hz.
        audio = 0.3 * torch.sin(two_pi * 80 * t) + 0.2 * torch.sin(two_pi * 120 * t)
    else:
        # Default: fundamental derived from prompt length, plus 2nd and 3rd harmonics.
        base_freq = 220 + len(prompt) * 5
        audio = 0.3 * torch.sin(two_pi * base_freq * t)
        audio += 0.1 * torch.sin(two_pi * base_freq * 2 * t)
        audio += 0.05 * torch.sin(two_pi * base_freq * 3 * t)

    # Fade in/out over 0.1 s to avoid clicks. The ramp is clamped to half the
    # clip so very short clips do not hit a slice/ramp length mismatch.
    fade_samples = min(int(0.1 * sample_rate), duration_samples // 2)
    if fade_samples > 0:
        envelope = torch.ones_like(audio)
        envelope[:fade_samples] = torch.linspace(0, 1, fade_samples)
        envelope[-fade_samples:] = torch.linspace(1, 0, fade_samples)
        audio *= envelope

    # Save into a fresh temp directory; unsqueeze adds the channel dimension.
    temp_dir = tempfile.mkdtemp()
    audio_path = os.path.join(temp_dir, "enhanced_demo_audio.wav")
    torchaudio.save(audio_path, audio.unsqueeze(0), sample_rate)
    return audio_path
def check_real_api_availability():
    """Report which real inference backends look usable from this process.

    Every probe is best-effort: a failure marks that backend as unavailable
    and is never propagated to the caller.

    Returns:
        dict mapping backend name to a bool:
        - ``"gradio_client"``: ``gradio_client`` imported and a ``Client`` for
          ``tencent/HunyuanVideo-Foley`` was constructed without error.
        - ``"hf_inference"``: ``HF_TOKEN`` or ``HUGGING_FACE_HUB_TOKEN`` is set
          to a non-empty value.
        - ``"replicate"``: ``replicate`` imported and ``REPLICATE_API_TOKEN``
          is set to a non-empty value.
    """
    api_status = {
        "gradio_client": False,
        "hf_inference": False,
        "replicate": False,
    }

    # gradio_client: the package must import and the client must construct.
    # ``except Exception`` (not a bare ``except``) keeps the probe best-effort
    # while letting KeyboardInterrupt / SystemExit propagate.
    try:
        from gradio_client import Client

        # NOTE(review): confirm the installed gradio_client accepts ``timeout``;
        # if it does not, the resulting TypeError reports this backend as down.
        Client("tencent/HunyuanVideo-Foley", timeout=5)
    except Exception:
        api_status["gradio_client"] = False
    else:
        api_status["gradio_client"] = True

    # Hugging Face inference: only the presence of a token is checked.
    hf_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN')
    api_status["hf_inference"] = bool(hf_token)

    # Replicate: the package must import and a token must be present.
    try:
        import replicate  # noqa: F401 - a successful import is the probe
    except Exception:
        api_status["replicate"] = False
    else:
        api_status["replicate"] = bool(os.environ.get('REPLICATE_API_TOKEN'))

    return api_status
def process_video_smart(video_file, text_prompt: str, guidance_scale: float, inference_steps: int, sample_nums: int) -> Tuple[list, str]:
    """Generate demo audio samples for an uploaded video and build a status report.

    The real-API availability is probed and reported, but audio is currently
    always produced by ``create_realistic_demo_audio``.

    Args:
        video_file: Uploaded video, either a path or an object exposing a
            ``name`` path attribute. ``None`` aborts with an error message.
        text_prompt: Description of the desired sound; ``None`` or blank text
            falls back to a generic prompt.
        guidance_scale: Only echoed in the status message.
        inference_steps: Only echoed in the status message.
        sample_nums: Requested number of samples; at most 3 are generated.

    Returns:
        ``(audio_paths, status_message)``. On failure ``audio_paths`` is empty
        and the message describes the error.
    """
    if video_file is None:
        return [], "❌ 请上传视频文件!"

    # An empty text box may arrive as "" rather than None, so treat any blank
    # prompt as missing.
    if text_prompt is None or not str(text_prompt).strip():
        text_prompt = "audio sound effects for this video"

    # Probe the real backends; the result is only reported, not acted on yet.
    api_status = check_real_api_availability()
    logger.info(f"API可用性检查: {api_status}")

    try:
        logger.info(f"处理视频: {video_file}")
        logger.info(f"文本提示: {text_prompt}")

        # Slider values can arrive as floats; range() needs an int.
        sample_count = min(int(sample_nums), 3)

        # NOTE(review): the suffix has the same length for every i, so the
        # deterministic waveform branches produce identical samples.
        audio_outputs = [
            create_realistic_demo_audio(video_file, f"{text_prompt}_variation_{i+1}")
            for i in range(sample_count)
        ]

        # Show the file name whether the upload is a path or a file-like
        # object with a ``name`` attribute.
        video_path = getattr(video_file, "name", video_file)
        if isinstance(video_path, (str, os.PathLike)):
            video_label = os.path.basename(video_path) or '已上传'
        else:
            video_label = '已上传'

        status_msg = f"""✅ 增强演示版本处理完成!
📹 **视频**: {video_label}
📝 **提示**: "{text_prompt}"
⚙️ **设置**: CFG={guidance_scale}, 步数={inference_steps}, 样本={sample_nums}
🎵 **生成**: {len(audio_outputs)} 个音频样本
🧠 **智能特性**:
• 根据文本内容选择音频类型
• 脚步声/雨声/风声/车辆声等不同效果
• 48kHz高质量输出
• 自动淡入淡出和包络处理
📊 **API状态检查**:
• Gradio Client: {'✅' if api_status['gradio_client'] else '❌'}
• HF Inference: {'✅' if api_status['hf_inference'] else '❌'}
• Replicate: {'✅' if api_status['replicate'] else '❌'}
💡 **这是增强演示版本,展示真实AI音频的工作流程**
🚀 **完整版本**: https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley"""
        return audio_outputs, status_msg
    except Exception as e:
        # UI boundary: log with traceback and report the failure to the user.
        logger.exception(f"处理失败: {str(e)}")
        return [], f"❌ 处理失败: {str(e)}"
def create_smart_interface():
    """Build the Gradio Blocks application for the smart demo.

    Lays out a header, an informational notice, an input column (video, prompt,
    three sliders, generate button), an output column (three audio players and
    a status box), a list of suggested prompts and a footer, then wires the
    sample-count slider and the generate button to their handlers.

    Returns:
        The constructed ``gr.Blocks`` object; it is not launched here.
    """
    # Static markup is collected first so the layout code below reads top-down.
    # The multi-line literals stay flush-left so their contents are unchanged.
    stylesheet = """
.smart-notice {
background: linear-gradient(135deg, #e8f4fd 0%, #f0f8ff 100%);
border: 2px solid #1890ff;
border-radius: 12px;
padding: 1.5rem;
margin: 1rem 0;
color: #0050b3;
}
.api-status {
background: #f6ffed;
border: 1px solid #52c41a;
border-radius: 8px;
padding: 1rem;
margin: 1rem 0;
color: #389e0d;
}
"""
    header_html = """
<div style="text-align: center; padding: 2rem; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 20px; margin-bottom: 2rem; color: white;">
<h1>🎵 HunyuanVideo-Foley</h1>
<p>智能演示版 - 真实工作流程体验</p>
</div>
"""
    notice_html = """
<div class="smart-notice">
<strong>🧠 智能演示模式:</strong>
<br>• 自动检测可用API服务
<br>• 根据文本内容生成对应音效类型
<br>• 完整展示AI音频生成工作流程
<br>• <strong>支持</strong>: 脚步声、雨声、风声、车辆声等多种音效
</div>
"""
    examples_html = """
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin: 1rem 0;">
<div style="padding: 1rem; background: #f8fafc; border-radius: 8px;">
<strong>脚步声:</strong> footsteps on wooden floor<br>
<strong>自然音:</strong> rain drops on leaves<br>
<strong>环境音:</strong> wind through the trees
</div>
<div style="padding: 1rem; background: #f8fafc; border-radius: 8px;">
<strong>机械音:</strong> car engine running<br>
<strong>动作音:</strong> door opening and closing<br>
<strong>水声:</strong> water flowing in stream
</div>
</div>
"""
    footer_html = """
<div style="text-align: center; padding: 2rem; color: #666; border-top: 1px solid #eee; margin-top: 2rem;">
<p><strong>🧠 智能演示版</strong> - 展示完整的AI音频生成工作流程</p>
<p>💡 根据不同描述词生成对应类型的音效</p>
<p>🔗 完整版本: <a href="https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley" target="_blank">GitHub Repository</a></p>
</div>
"""

    # NOTE(review): container nesting below was reconstructed from a listing
    # without indentation - confirm which sliders share the inner row and
    # that the suggested prompts sit below the two-column row.
    with gr.Blocks(css=stylesheet, title="HunyuanVideo-Foley Smart Demo") as app:
        # Header and notice banner
        gr.HTML(header_html)
        gr.HTML(notice_html)

        with gr.Row():
            # Left column: inputs
            with gr.Column(scale=1):
                gr.Markdown("### 📹 视频输入")
                video_input = gr.Video(label="上传视频文件")
                text_input = gr.Textbox(
                    label="🎯 音频描述",
                    placeholder="例如:footsteps on wood floor, rain on leaves, wind through trees, car engine",
                    lines=3,
                    value="footsteps on the ground",
                )
                with gr.Row():
                    guidance_scale = gr.Slider(
                        minimum=1.0, maximum=10.0, value=4.5, step=0.1, label="🎚️ CFG Scale"
                    )
                    inference_steps = gr.Slider(
                        minimum=10, maximum=100, value=50, step=5, label="⚡ 推理步数"
                    )
                    sample_nums = gr.Slider(
                        minimum=1, maximum=3, value=2, step=1, label="🎲 样本数量"
                    )
                generate_btn = gr.Button("🎵 智能生成音频", variant="primary")

            # Right column: results
            with gr.Column(scale=1):
                gr.Markdown("### 🎵 生成结果")
                audio_output_1 = gr.Audio(label="样本 1", visible=True)
                audio_output_2 = gr.Audio(label="样本 2", visible=False)
                audio_output_3 = gr.Audio(label="样本 3", visible=False)
                status_output = gr.Textbox(
                    label="处理状态",
                    interactive=False,
                    lines=12,
                    placeholder="等待处理...",
                )

        # Suggested prompts
        gr.Markdown("### 🌟 推荐提示词")
        gr.HTML(examples_html)

        def process_smart(video_file, text_prompt, cfg_value, step_count, requested_samples):
            """Run generation and spread the results over the three audio slots."""
            audio_files, status_msg = process_video_smart(
                video_file, text_prompt, cfg_value, step_count, int(requested_samples)
            )
            # Keep at most three results and pad the remaining slots with None.
            slots = list(audio_files[:3])
            slots += [None] * (3 - len(slots))
            first, second, third = slots
            return first, second, third, status_msg

        def update_visibility(requested_samples):
            """Show as many audio players as samples requested; the first always shows."""
            count = int(requested_samples)
            always_shown = [gr.update(visible=True)]
            optional = [gr.update(visible=count >= threshold) for threshold in (2, 3)]
            return always_shown + optional

        audio_players = [audio_output_1, audio_output_2, audio_output_3]

        # Wire events: slider toggles player visibility, button triggers generation.
        sample_nums.change(
            fn=update_visibility,
            inputs=[sample_nums],
            outputs=audio_players,
        )
        generate_btn.click(
            fn=process_smart,
            inputs=[video_input, text_input, guidance_scale, inference_steps, sample_nums],
            outputs=audio_players + [status_output],
        )

        # Footer
        gr.HTML(footer_html)

    return app
if __name__ == "__main__":
    # Logging: drop the handlers registered so far and print every formatted
    # record to stdout as-is, at INFO level and above.
    logger.remove()
    logger.add(lambda msg: print(msg, end=''), level="INFO")
    logger.info("启动 HunyuanVideo-Foley 智能演示版...")

    # Build the UI before announcing readiness.
    app = create_smart_interface()
    logger.info("智能演示版就绪 - 支持多种音效类型")

    # Serve on all interfaces at port 7860, without a public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": False,
        "show_error": True,
    }
    app.launch(**launch_options)