Spaces:

FunAudioLLM
/

Fun-CineForge-Demo

Running on Zero

App Files Files Community

xuan3986 committed 11 days ago

Commit

eacf3b5

verified ·

1 Parent(s): 152a383

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -54

app.py CHANGED Viewed

@@ -69,22 +69,22 @@ def create_segments_ui():
     segments = []
     accordions = []
     for i in range(MAX_SEGMENTS):
-        with gr.Accordion(f"🎬 配音片段 {i + 1}", open=(i == 0), visible=(i == 0)) as acc:
             accordions.append(acc)
             with gr.Row():
-                text_input = gr.Textbox(label="📝 配音文本内容", placeholder="输入台词...", lines=2, scale=3, elem_id=f"text_{i}")
-                clue_input = gr.Textbox(label="💡 线索描述", placeholder="一位中年男性角色语气沉稳且坚定，流露出对自身忠诚的强烈自信与决心。整体情感是忠贞不渝的承诺和不容置疑的信念。", lines=2, scale=3, elem_id=f"clue_{i}")
             with gr.Row():
-                start_time = gr.Number(label="⏱️ 起始时间 (s)", value=0.0 + i*5, precision=2, scale=2, elem_id=f"start_{i}")
-                end_time = gr.Number(label="⏱️ 终止时间 (s)", value=5.0 + i*5, precision=2, scale=2, elem_id=f"end_{i}")
             with gr.Row():
-                age_input = gr.Dropdown(label="👤 年龄", choices=["儿童", "青年", "中年", "中老年", "老年", "不确定"], value="不确定", scale=2, elem_id=f"age_{i}")
-                gender_input = gr.Dropdown(label="👤 性别", choices=["男", "女", "不确定"], value="不确定", scale=2, elem_id=f"gender_{i}")
             with gr.Row():
-                ref_audio = gr.Audio(label="🎤 参考语音 (可选，默认以视频原声作为参考音频)", sources=["upload"], type="filepath", scale=4,elem_id=f"audio_{i}")
-                load_audio_btn = gr.Button("📂 加载示例音频", size="sm", variant="secondary", scale=1) if i == 0 else None
             with gr.Row():
-                enable_check = gr.Checkbox(label="启用此片段", value=(i == 0), scale=1, elem_id=f"enable_{i}")
         segments.append({
             "accordion": acc, "text": text_input, "clue": clue_input, "start": start_time, "end": end_time,
@@ -95,11 +95,11 @@ def create_segments_ui():
 def add_segment_fn(current_count):
     """点击加号：显示下一个片段，到达上限则禁用按钮"""
     if current_count >= MAX_SEGMENTS:
-        return [current_count] + [gr.update() for _ in range(MAX_SEGMENTS)] + [gr.update(interactive=False, value=f"已达上限 ({MAX_SEGMENTS})")]
     new_count = current_count + 1
     vis = [gr.update(visible=(i < new_count)) for i in range(MAX_SEGMENTS)]
-    btn = gr.update(interactive=(new_count < MAX_SEGMENTS), value="➕新片段")
     return [new_count] + vis + [btn]
 def load_srt_fn(srt_file, current_count):
@@ -111,11 +111,11 @@ def load_srt_fn(srt_file, current_count):
         with open(srt_file, 'r', encoding='utf-8-sig') as f:
             content = f.read()
     except Exception as e:
-        gr.Warning(f"读取 SRT 文件失败: {e}")
         return [current_count] + empty_fields + empty_vis + [gr.update()]
     parsed = parse_srt_content(content)
     if not parsed:
-        print(" 未解析到有效字幕，请检查 SRT 格式")
         return [current_count] + empty_fields + empty_vis + [gr.update()]
     updates = []
     for i in range(MAX_SEGMENTS):
@@ -134,24 +134,24 @@ def load_srt_fn(srt_file, current_count):
     vis = [gr.update(visible=(i < new_count)) for i in range(MAX_SEGMENTS)]
     btn = gr.update(interactive=(new_count < MAX_SEGMENTS))
     if len(parsed) > MAX_SEGMENTS:
-        gr.Warning(f"SRT 包含 {len(parsed)} 个片段，已截取前 {MAX_SEGMENTS} 条")
     return [new_count] + updates + vis + [btn]
 def process_dubbing(video_file, *segment_inputs, progress=gr.Progress()):
     """主推理流程"""
     if not video_file:
-        return None, "❌ 请上传视频文件"
     video_duration = get_video_duration(video_file)
     if video_duration <= 0:
-        return None, "❌ 无法获取视频时长，请检查视频文件"
     if os.path.exists(TEMP_DIR):
         try:
             shutil.rmtree(TEMP_DIR)
         except Exception as e:
-            return None, f"❌ 清空临时目录失败：{e}"
     os.makedirs(TEMP_DIR, exist_ok=True)
     # 解析 segment_inputs
@@ -172,51 +172,51 @@ def process_dubbing(video_file, *segment_inputs, progress=gr.Progress()):
         errors = validate_timestamps(start, end, video_duration)
         if errors:
-            return None, f"❌ 片段 {i+1} 时间戳错误：\n" + "\n".join(errors)
         data = {
             "text": str(text).strip(),
             "clue": str(clue) if clue else "",
             "start": float(start) if start else 0.0,
             "end": float(end) if end else 0.0,
-            "age": str(age) if age else "不确定",
-            "gender": str(gender) if gender else "不确定",
             "ref_audio": str(ref_audio) if ref_audio else ""
         }
         segments_data.append(data)
     if not segments_data:
-        return None, "❌ 有效片段数据为空，请启用并填写至少一个片段"
     try:
-        progress(0.1, desc="📋 预处理视频，生成 JSONL 数据...")
         frontend = init_frontend_models()
         jsonl_path, jsonl_items = generate_jsonl_data(frontend, video_file, segments_data, TEMP_DIR, video_duration)
-        report_lines = [f"✅ 任务完成！共生成 **{len(jsonl_items)}** 个片段数据。\n", "详细 JSONL 数据预览：**", "=" * 40]
         for idx, item in enumerate(jsonl_items):
-            report_lines.extend([f"\n---片段 #{idx + 1} ---", json.dumps(item, ensure_ascii=False, indent=2), "-" * 40])
         full_report = "\n".join(report_lines)
-        progress(0.3, desc="🔄 FunCineForge 模型加载中...")
         eng = init_engine()
         if eng and jsonl_items:
             try:
-                progress(0.5, desc="🚀 FunCineForge 模型推理中...")
                 eng.inference(jsonl_path)
-                progress(0.8, desc="🎵 正在将配音语音粘贴回静音视频...")
                 output_wav_dir = os.path.join(TEMP_DIR, "wav")
                 final_video_path = os.path.join(TEMP_DIR, "dubbed_video.mp4")
                 if not os.path.exists(output_wav_dir):
-                    return None, f"⚠️ 未找到音频输出目录：{output_wav_dir}"
                 wav_files = sorted([f for f in os.listdir(output_wav_dir) if f.endswith('.wav')])
                 if not wav_files:
-                    return None, f"⚠️ 未生成任何音频文件：{output_wav_dir}"
                 time_mapping = {}
                 for item in jsonl_items:
@@ -253,22 +253,22 @@ def process_dubbing(video_file, *segment_inputs, progress=gr.Progress()):
                 if 'final_audio' in locals(): final_audio.close()
                 final_clip.close()
-                progress(1.0, desc="✅ 配音完成")
                 return final_video_path, full_report
             except Exception as e:
                 import traceback; traceback.print_exc()
                 if "index out of range" in str(e):
-                    return None, f"⚠️ 模型推理失败。错误：{str(e)}，建议补齐输入的线索描述和说话人属性"
                 else:
-                    return None, f"⚠️ 模型推理失败。错误：{str(e)}"
         else:
             time.sleep(1)
-            progress(1.0, desc="模拟完成")
             return video_file, full_report
     except Exception as e:
         import traceback; traceback.print_exc()
-        return None, f"❌ 发生错误：{str(e)}"
 # ==================== 主程序 ====================
@@ -276,7 +276,7 @@ def process_dubbing(video_file, *segment_inputs, progress=gr.Progress()):
 def main():
     os.makedirs(TEMP_DIR, exist_ok=True)
     with gr.Blocks(
-        title="Fun-CineForge 影视配音平台",
         theme=gr.themes.Soft(),
         css="""
         .segment-accordion { margin: 10px 0; }
@@ -288,39 +288,39 @@ def main():
         gr.Markdown("""
         # 🎬 Fun-CineForge
-        **工作流程：** 上传短视频 → 配音片段信息（或上传 .srt 字幕文件） → 上传参考音色（可选） → 预处理、模型加载和推理 → 输出配音视频
         """)
         with gr.Row():
             with gr.Column(scale=1):
-                video_input = gr.Video(label="上传视频", sources=["upload"])
-                load_video_btn = gr.Button("📂 加载示例视频", variant="secondary", size="sm")
-                srt_input = gr.UploadButton("上传 SRT 字幕", file_types=[".srt"], size="sm", variant="secondary")
                 # with gr.Row(elem_classes=["srt-compact"]):
                 #     srt_input = gr.File(label="上传 SRT 字幕", file_types=[".srt"], height="auto")
-                gr.Markdown("### 🎛️ 配音片段配置")
                 segments, accordions = create_segments_ui()
                 seg_count_state = gr.State(1)  #🔑记录当前可见片段数
-                add_segment_btn = gr.Button("➕添加新片段", size="sm", variant="secondary")
-                submit_btn = gr.Button("🚀 开始生成配音", variant="stop", size="lg")
             with gr.Column(scale=1):
-                video_output = gr.Video(label="📺 配音后视频", autoplay=True)
-                status_text = gr.Textbox(label="结果状态", interactive=False, lines=2)
                 gr.Markdown("""
-                ### 📝 使用说明
-                | 字段 | 说明 |
                 |------|------|
-                | 配音文本 | 该片段台词内容（支持中/英） |
-                | 线索描述 | 请参考样例格式，阐述配音要求，重点描述说话人的性别年龄、语气和情感 |
-                | 时间戳 | 起止时间戳 (可精确到毫秒)，模型对时间戳敏感，建议紧邻有声区间。时长 ≤30s/片段 |
-                | 年龄/性别 | 说话人属性选项 |
-                | 参考语音 | 音色克隆参考 (可选) |
-        **⚠️ 注意：** 确保每个片段的时间戳不重叠，且时间戳不超过视频总时长。模型会根据片段的时间长度进行强制时间对齐，弱监督对齐唇部运动。
                 """)
         # ==================== 事件绑定 ====================

     segments = []
     accordions = []
     for i in range(MAX_SEGMENTS):
+        with gr.Accordion(f"🎬 Dubbing clip {i + 1}", open=(i == 0), visible=(i == 0)) as acc:
             accordions.append(acc)
             with gr.Row():
+                text_input = gr.Textbox(label="📝 Dubbing script", placeholder="Please enter the script...", lines=2, scale=3, elem_id=f"text_{i}")
+                clue_input = gr.Textbox(label="💡 Clue description", placeholder="一位中年男性角色语气沉稳且坚定，流露出对自身忠诚的强烈自信与决心。整体情感是忠贞不渝的承诺和不容置疑的信念。", lines=2, scale=3, elem_id=f"clue_{i}")
             with gr.Row():
+                start_time = gr.Number(label="⏱️ Start timestamp (s)", value=0.0 + i*5, precision=2, scale=2, elem_id=f"start_{i}")
+                end_time = gr.Number(label="⏱️ End timestamp (s)", value=5.0 + i*5, precision=2, scale=2, elem_id=f"end_{i}")
             with gr.Row():
+                age_input = gr.Dropdown(label="👤 Age", choices=["child", "teenager", "adult", "middle-aged", "elderly", "unknown"], value="unknown", scale=2, elem_id=f"age_{i}")
+                gender_input = gr.Dropdown(label="👤 Gender", choices=["male", "female", "unknown"], value="unknown", scale=2, elem_id=f"gender_{i}")
             with gr.Row():
+                ref_audio = gr.Audio(label="🎤 Reference audio (optional, the video's audio is used as the reference audio by default).", sources=["upload"], type="filepath", scale=4,elem_id=f"audio_{i}")
+                load_audio_btn = gr.Button("📂 Load sample audio", size="sm", variant="secondary", scale=1) if i == 0 else None
             with gr.Row():
+                enable_check = gr.Checkbox(label="Enable this clip", value=(i == 0), scale=1, elem_id=f"enable_{i}")
         segments.append({
             "accordion": acc, "text": text_input, "clue": clue_input, "start": start_time, "end": end_time,
 def add_segment_fn(current_count):
     """点击加号：显示下一个片段，到达上限则禁用按钮"""
     if current_count >= MAX_SEGMENTS:
+        return [current_count] + [gr.update() for _ in range(MAX_SEGMENTS)] + [gr.update(interactive=False, value=f"The limit has been reached. ({MAX_SEGMENTS})")]
     new_count = current_count + 1
     vis = [gr.update(visible=(i < new_count)) for i in range(MAX_SEGMENTS)]
+    btn = gr.update(interactive=(new_count < MAX_SEGMENTS), value="➕ New clip")
     return [new_count] + vis + [btn]
 def load_srt_fn(srt_file, current_count):
         with open(srt_file, 'r', encoding='utf-8-sig') as f:
             content = f.read()
     except Exception as e:
+        gr.Warning(f"Failed to read SRT file: {e}")
         return [current_count] + empty_fields + empty_vis + [gr.update()]
     parsed = parse_srt_content(content)
     if not parsed:
+        print(" No valid subtitles were parsed. Please check the SRT format.")
         return [current_count] + empty_fields + empty_vis + [gr.update()]
     updates = []
     for i in range(MAX_SEGMENTS):
     vis = [gr.update(visible=(i < new_count)) for i in range(MAX_SEGMENTS)]
     btn = gr.update(interactive=(new_count < MAX_SEGMENTS))
     if len(parsed) > MAX_SEGMENTS:
+        gr.Warning(f"The SRT contains {len(parsed)} fragments, of which the first {MAX_SEGMENTS} have been truncated.")
     return [new_count] + updates + vis + [btn]
 def process_dubbing(video_file, *segment_inputs, progress=gr.Progress()):
     """主推理流程"""
     if not video_file:
+        return None, "❌ Please upload the video file."
     video_duration = get_video_duration(video_file)
     if video_duration <= 0:
+        return None, "❌ Unable to obtain video duration, please check the video file."
     if os.path.exists(TEMP_DIR):
         try:
             shutil.rmtree(TEMP_DIR)
         except Exception as e:
+            return None, f"❌ Failed to clear temporary directory：{e}"
     os.makedirs(TEMP_DIR, exist_ok=True)
     # 解析 segment_inputs
         errors = validate_timestamps(start, end, video_duration)
         if errors:
+            return None, f"❌ Clip {i+1} timestamp error：\n" + "\n".join(errors)
         data = {
             "text": str(text).strip(),
             "clue": str(clue) if clue else "",
             "start": float(start) if start else 0.0,
             "end": float(end) if end else 0.0,
+            "age": str(age) if age else "unknown",
+            "gender": str(gender) if gender else "unknown",
             "ref_audio": str(ref_audio) if ref_audio else ""
         }
         segments_data.append(data)
     if not segments_data:
+        return None, "❌ The valid clip data is empty. Please enable and fill in at least one clip."
     try:
+        progress(0.1, desc="📋 Preprocess the video to generate JSONL data...")
         frontend = init_frontend_models()
         jsonl_path, jsonl_items = generate_jsonl_data(frontend, video_file, segments_data, TEMP_DIR, video_duration)
+        report_lines = [f"✅ Task completed! A total of **{len(jsonl_items)}** data fragments were generated.\n", "Detailed JSONL data preview：**", "=" * 40]
         for idx, item in enumerate(jsonl_items):
+            report_lines.extend([f"\n---Clip #{idx + 1} ---", json.dumps(item, ensure_ascii=False, indent=2), "-" * 40])
         full_report = "\n".join(report_lines)
+        progress(0.3, desc="🔄 FunCineForge dubbing model loading...")
         eng = init_engine()
         if eng and jsonl_items:
             try:
+                progress(0.5, desc="🚀 FunCineForge dubbing model inference...")
                 eng.inference(jsonl_path)
+                progress(0.8, desc="🎵 Pasting the voiceover back into the muted video...")
                 output_wav_dir = os.path.join(TEMP_DIR, "wav")
                 final_video_path = os.path.join(TEMP_DIR, "dubbed_video.mp4")
                 if not os.path.exists(output_wav_dir):
+                    return None, f"⚠️ Audio output directory not found：{output_wav_dir}"
                 wav_files = sorted([f for f in os.listdir(output_wav_dir) if f.endswith('.wav')])
                 if not wav_files:
+                    return None, f"⚠️ No audio files were generated：{output_wav_dir}"
                 time_mapping = {}
                 for item in jsonl_items:
                 if 'final_audio' in locals(): final_audio.close()
                 final_clip.close()
+                progress(1.0, desc="✅ Dubbing complete")
                 return final_video_path, full_report
             except Exception as e:
                 import traceback; traceback.print_exc()
                 if "index out of range" in str(e):
+                    return None, f"⚠️ Model inference failed. Error: {str(e)}. It is recommended to complete the input clue description and speaker attributes."
                 else:
+                    return None, f"⚠️ Model inference failed. Error: {str(e)}"
         else:
             time.sleep(1)
+            progress(1.0, desc="Simulation complete")
             return video_file, full_report
     except Exception as e:
         import traceback; traceback.print_exc()
+        return None, f"❌ Error: {str(e)}"
 # ==================== 主程序 ====================
 def main():
     os.makedirs(TEMP_DIR, exist_ok=True)
     with gr.Blocks(
+        title="Fun-CineForge-Demo",
         theme=gr.themes.Soft(),
         css="""
         .segment-accordion { margin: 10px 0; }
         gr.Markdown("""
         # 🎬 Fun-CineForge
+        **Workflow:** Upload short video → Add clip information (or upload .srt subtitle file) → Upload reference audio (optional) → Preprocessing, model loading, and inference → Output dubbed video
         """)
         with gr.Row():
             with gr.Column(scale=1):
+                video_input = gr.Video(label="Upload video", sources=["upload"])
+                load_video_btn = gr.Button("📂 Load sample video", variant="secondary", size="sm")
+                srt_input = gr.UploadButton("Upload SRT subtitles", file_types=[".srt"], size="sm", variant="secondary")
                 # with gr.Row(elem_classes=["srt-compact"]):
                 #     srt_input = gr.File(label="上传 SRT 字幕", file_types=[".srt"], height="auto")
+                gr.Markdown("### 🎛️ Dubbing clip configuration")
                 segments, accordions = create_segments_ui()
                 seg_count_state = gr.State(1)  #🔑记录当前可见片段数
+                add_segment_btn = gr.Button("➕Add new clip", size="sm", variant="secondary")
+                submit_btn = gr.Button("🚀 Start dubbing", variant="stop", size="lg")
             with gr.Column(scale=1):
+                video_output = gr.Video(label="📺 Dubbed video", autoplay=True)
+                status_text = gr.Textbox(label="Result status", interactive=False, lines=2)
                 gr.Markdown("""
+                ### 📝 Instructions for use
+                | Fields | Descriptions |
                 |------|------|
+                | Dubbing script | The content of this clip (supports Chinese/English) |
+                | Clue description | Please refer to the sample format to explain the dubbing requirements, focusing on describing the speaker's gender, age, tone, and emotion |
+                | Timestamps | Start and end timestamps (accurate to milliseconds). The model is sensitive to timestamps; it is recommended to use timestamps adjacent to the audio clip. Duration ≤ 30s/clip |
+                | Age/Gender | Speaker attribute options |
+                | Reference audio | Voice cloning reference (Optional) |
+                **⚠️ Note:** Ensure that the timestamps of each clip don't overlap and don't exceed the video duration. The model will perform time alignment based on the timestamps, with weak supervision aligning lip movements.
                 """)
         # ==================== 事件绑定 ====================