xuan3986 commited on
Commit
eacf3b5
·
verified ·
1 Parent(s): 152a383

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -54
app.py CHANGED
@@ -69,22 +69,22 @@ def create_segments_ui():
69
  segments = []
70
  accordions = []
71
  for i in range(MAX_SEGMENTS):
72
- with gr.Accordion(f"🎬 配音片段 {i + 1}", open=(i == 0), visible=(i == 0)) as acc:
73
  accordions.append(acc)
74
  with gr.Row():
75
- text_input = gr.Textbox(label="📝 配音文本内容", placeholder="输入台词...", lines=2, scale=3, elem_id=f"text_{i}")
76
- clue_input = gr.Textbox(label="💡 线索描述", placeholder="一位中年男性角色语气沉稳且坚定,流露出对自身忠诚的强烈自信与决心。整体情感是忠贞不渝的承诺和不容置疑的信念。", lines=2, scale=3, elem_id=f"clue_{i}")
77
  with gr.Row():
78
- start_time = gr.Number(label="⏱️ 起始时间 (s)", value=0.0 + i*5, precision=2, scale=2, elem_id=f"start_{i}")
79
- end_time = gr.Number(label="⏱️ 终止时间 (s)", value=5.0 + i*5, precision=2, scale=2, elem_id=f"end_{i}")
80
  with gr.Row():
81
- age_input = gr.Dropdown(label="👤 年龄", choices=["儿童", "青年", "中年", "中老年", "老年", "不确定"], value="不确定", scale=2, elem_id=f"age_{i}")
82
- gender_input = gr.Dropdown(label="👤 性别", choices=["男", "女", "不确定"], value="不确定", scale=2, elem_id=f"gender_{i}")
83
  with gr.Row():
84
- ref_audio = gr.Audio(label="🎤 参考语音 (可选,默认以视频原声作为参考音频)", sources=["upload"], type="filepath", scale=4,elem_id=f"audio_{i}")
85
- load_audio_btn = gr.Button("📂 加载示例音频", size="sm", variant="secondary", scale=1) if i == 0 else None
86
  with gr.Row():
87
- enable_check = gr.Checkbox(label="启用此片段", value=(i == 0), scale=1, elem_id=f"enable_{i}")
88
 
89
  segments.append({
90
  "accordion": acc, "text": text_input, "clue": clue_input, "start": start_time, "end": end_time,
@@ -95,11 +95,11 @@ def create_segments_ui():
95
  def add_segment_fn(current_count):
96
  """点击加号:显示下一个片段,到达上限则禁用按钮"""
97
  if current_count >= MAX_SEGMENTS:
98
- return [current_count] + [gr.update() for _ in range(MAX_SEGMENTS)] + [gr.update(interactive=False, value=f"已达上限 ({MAX_SEGMENTS})")]
99
 
100
  new_count = current_count + 1
101
  vis = [gr.update(visible=(i < new_count)) for i in range(MAX_SEGMENTS)]
102
- btn = gr.update(interactive=(new_count < MAX_SEGMENTS), value="➕新片段")
103
  return [new_count] + vis + [btn]
104
 
105
  def load_srt_fn(srt_file, current_count):
@@ -111,11 +111,11 @@ def load_srt_fn(srt_file, current_count):
111
  with open(srt_file, 'r', encoding='utf-8-sig') as f:
112
  content = f.read()
113
  except Exception as e:
114
- gr.Warning(f"读取 SRT 文件失败: {e}")
115
  return [current_count] + empty_fields + empty_vis + [gr.update()]
116
  parsed = parse_srt_content(content)
117
  if not parsed:
118
- print(" 未解析到有效字幕,请检查 SRT 格式")
119
  return [current_count] + empty_fields + empty_vis + [gr.update()]
120
  updates = []
121
  for i in range(MAX_SEGMENTS):
@@ -134,24 +134,24 @@ def load_srt_fn(srt_file, current_count):
134
  vis = [gr.update(visible=(i < new_count)) for i in range(MAX_SEGMENTS)]
135
  btn = gr.update(interactive=(new_count < MAX_SEGMENTS))
136
  if len(parsed) > MAX_SEGMENTS:
137
- gr.Warning(f"SRT 包含 {len(parsed)} 个片段,已截取前 {MAX_SEGMENTS} ")
138
 
139
  return [new_count] + updates + vis + [btn]
140
 
141
  def process_dubbing(video_file, *segment_inputs, progress=gr.Progress()):
142
  """主推理流程"""
143
  if not video_file:
144
- return None, "❌ 请上传视频文件"
145
 
146
  video_duration = get_video_duration(video_file)
147
  if video_duration <= 0:
148
- return None, "❌ 无法获取视频时长,请检查视频文件"
149
 
150
  if os.path.exists(TEMP_DIR):
151
  try:
152
  shutil.rmtree(TEMP_DIR)
153
  except Exception as e:
154
- return None, f"❌ 清空临时目录失败:{e}"
155
  os.makedirs(TEMP_DIR, exist_ok=True)
156
 
157
  # 解析 segment_inputs
@@ -172,51 +172,51 @@ def process_dubbing(video_file, *segment_inputs, progress=gr.Progress()):
172
 
173
  errors = validate_timestamps(start, end, video_duration)
174
  if errors:
175
- return None, f"❌ 片段 {i+1} 时间戳错误:\n" + "\n".join(errors)
176
 
177
  data = {
178
  "text": str(text).strip(),
179
  "clue": str(clue) if clue else "",
180
  "start": float(start) if start else 0.0,
181
  "end": float(end) if end else 0.0,
182
- "age": str(age) if age else "不确定",
183
- "gender": str(gender) if gender else "不确定",
184
  "ref_audio": str(ref_audio) if ref_audio else ""
185
  }
186
 
187
  segments_data.append(data)
188
 
189
  if not segments_data:
190
- return None, "❌ 有效片段数据为空,请启用并填写至少一个片段"
191
 
192
  try:
193
- progress(0.1, desc="📋 预处理视频,生成 JSONL 数据...")
194
  frontend = init_frontend_models()
195
  jsonl_path, jsonl_items = generate_jsonl_data(frontend, video_file, segments_data, TEMP_DIR, video_duration)
196
- report_lines = [f"✅ 任务完成!共生成 **{len(jsonl_items)}** 个片段数据。\n", "详细 JSONL 数据预览:**", "=" * 40]
197
  for idx, item in enumerate(jsonl_items):
198
- report_lines.extend([f"\n---片段 #{idx + 1} ---", json.dumps(item, ensure_ascii=False, indent=2), "-" * 40])
199
  full_report = "\n".join(report_lines)
200
 
201
- progress(0.3, desc="🔄 FunCineForge 模型加载中...")
202
 
203
  eng = init_engine()
204
  if eng and jsonl_items:
205
  try:
206
- progress(0.5, desc="🚀 FunCineForge 模型推理中...")
207
  eng.inference(jsonl_path)
208
 
209
- progress(0.8, desc="🎵 正在将配音语音粘贴回静音视频...")
210
 
211
  output_wav_dir = os.path.join(TEMP_DIR, "wav")
212
  final_video_path = os.path.join(TEMP_DIR, "dubbed_video.mp4")
213
 
214
  if not os.path.exists(output_wav_dir):
215
- return None, f"⚠️ 未找到音频输出目录:{output_wav_dir}"
216
 
217
  wav_files = sorted([f for f in os.listdir(output_wav_dir) if f.endswith('.wav')])
218
  if not wav_files:
219
- return None, f"⚠️ 未生成任何音频文件:{output_wav_dir}"
220
 
221
  time_mapping = {}
222
  for item in jsonl_items:
@@ -253,22 +253,22 @@ def process_dubbing(video_file, *segment_inputs, progress=gr.Progress()):
253
  if 'final_audio' in locals(): final_audio.close()
254
  final_clip.close()
255
 
256
- progress(1.0, desc="✅ 配音完成")
257
  return final_video_path, full_report
258
  except Exception as e:
259
  import traceback; traceback.print_exc()
260
  if "index out of range" in str(e):
261
- return None, f"⚠️ 模型推理失败。错误:{str(e)},建议补齐输入的线索描述和说话人属性"
262
  else:
263
- return None, f"⚠️ 模型推理失败。错误:{str(e)}"
264
  else:
265
  time.sleep(1)
266
- progress(1.0, desc="模拟完成")
267
  return video_file, full_report
268
 
269
  except Exception as e:
270
  import traceback; traceback.print_exc()
271
- return None, f"❌ 发生错误:{str(e)}"
272
 
273
 
274
  # ==================== 主程序 ====================
@@ -276,7 +276,7 @@ def process_dubbing(video_file, *segment_inputs, progress=gr.Progress()):
276
  def main():
277
  os.makedirs(TEMP_DIR, exist_ok=True)
278
  with gr.Blocks(
279
- title="Fun-CineForge 影视配音平台",
280
  theme=gr.themes.Soft(),
281
  css="""
282
  .segment-accordion { margin: 10px 0; }
@@ -288,39 +288,39 @@ def main():
288
  gr.Markdown("""
289
  # 🎬 Fun-CineForge
290
 
291
- **工作流程:** 上传短视频 → 配音片段信息(或上传 .srt 字幕文件) → 上传参考音色(可选) → 预处理、模型加载和推理 → 输出配音视频
292
  """)
293
 
294
  with gr.Row():
295
  with gr.Column(scale=1):
296
- video_input = gr.Video(label="上传视频", sources=["upload"])
297
- load_video_btn = gr.Button("📂 加载示例视频", variant="secondary", size="sm")
298
- srt_input = gr.UploadButton("上传 SRT 字幕", file_types=[".srt"], size="sm", variant="secondary")
299
  # with gr.Row(elem_classes=["srt-compact"]):
300
  # srt_input = gr.File(label="上传 SRT 字幕", file_types=[".srt"], height="auto")
301
- gr.Markdown("### 🎛️ 配音片段配置")
302
 
303
  segments, accordions = create_segments_ui()
304
  seg_count_state = gr.State(1) #🔑记录当前可见片段数
305
- add_segment_btn = gr.Button("➕添加新片段", size="sm", variant="secondary")
306
- submit_btn = gr.Button("🚀 开始生成配音", variant="stop", size="lg")
307
 
308
  with gr.Column(scale=1):
309
- video_output = gr.Video(label="📺 配音后视频", autoplay=True)
310
 
311
- status_text = gr.Textbox(label="结果状态", interactive=False, lines=2)
312
 
313
  gr.Markdown("""
314
- ### 📝 使用说明
315
- | 字段 | 说明 |
316
  |------|------|
317
- | 配音文本 | 该片段台词内容(支持中/英) |
318
- | 线索描述 | 请参考样例格式,阐述配音要求,重点描述说话人的性别年龄、语气和情感 |
319
- | 时间戳 | 起止时间戳 (可精确到毫秒),模型对时间戳敏感,建议紧邻有声区间。时长 ≤30s/片段 |
320
- | 年龄/性别 | 说话人属性选项 |
321
- | 参考语音 | 音色克隆参考 (可选) |
322
-
323
- **⚠️ 注意:** 确保每个片段的时间戳不重叠,且时间戳不超过视频总时长。模型会根据片段的时间长度进行强制时间对齐,弱监督对齐唇部运动。
324
  """)
325
 
326
  # ==================== 事件绑定 ====================
 
69
  segments = []
70
  accordions = []
71
  for i in range(MAX_SEGMENTS):
72
+ with gr.Accordion(f"🎬 Dubbing clip {i + 1}", open=(i == 0), visible=(i == 0)) as acc:
73
  accordions.append(acc)
74
  with gr.Row():
75
+ text_input = gr.Textbox(label="📝 Dubbing script", placeholder="Please enter the script...", lines=2, scale=3, elem_id=f"text_{i}")
76
+ clue_input = gr.Textbox(label="💡 Clue description", placeholder="一位中年男性角色语气沉稳且坚定,流露出对自身忠诚的强烈自信与决心。整体情感是忠贞不渝的承诺和不容置疑的信念。", lines=2, scale=3, elem_id=f"clue_{i}")
77
  with gr.Row():
78
+ start_time = gr.Number(label="⏱️ Start timestamp (s)", value=0.0 + i*5, precision=2, scale=2, elem_id=f"start_{i}")
79
+ end_time = gr.Number(label="⏱️ End timestamp (s)", value=5.0 + i*5, precision=2, scale=2, elem_id=f"end_{i}")
80
  with gr.Row():
81
+ age_input = gr.Dropdown(label="👤 Age", choices=["child", "teenager", "adult", "middle-aged", "elderly", "unknown"], value="unknown", scale=2, elem_id=f"age_{i}")
82
+ gender_input = gr.Dropdown(label="👤 Gender", choices=["male", "female", "unknown"], value="unknown", scale=2, elem_id=f"gender_{i}")
83
  with gr.Row():
84
+ ref_audio = gr.Audio(label="🎤 Reference audio (optional, the video's audio is used as the reference audio by default).", sources=["upload"], type="filepath", scale=4,elem_id=f"audio_{i}")
85
+ load_audio_btn = gr.Button("📂 Load sample audio", size="sm", variant="secondary", scale=1) if i == 0 else None
86
  with gr.Row():
87
+ enable_check = gr.Checkbox(label="Enable this clip", value=(i == 0), scale=1, elem_id=f"enable_{i}")
88
 
89
  segments.append({
90
  "accordion": acc, "text": text_input, "clue": clue_input, "start": start_time, "end": end_time,
 
95
  def add_segment_fn(current_count):
96
  """点击加号:显示下一个片段,到达上限则禁用按钮"""
97
  if current_count >= MAX_SEGMENTS:
98
+ return [current_count] + [gr.update() for _ in range(MAX_SEGMENTS)] + [gr.update(interactive=False, value=f"Limit reached ({MAX_SEGMENTS})")]
99
 
100
  new_count = current_count + 1
101
  vis = [gr.update(visible=(i < new_count)) for i in range(MAX_SEGMENTS)]
102
+ btn = gr.update(interactive=(new_count < MAX_SEGMENTS), value="➕ New clip")
103
  return [new_count] + vis + [btn]
104
 
105
  def load_srt_fn(srt_file, current_count):
 
111
  with open(srt_file, 'r', encoding='utf-8-sig') as f:
112
  content = f.read()
113
  except Exception as e:
114
+ gr.Warning(f"Failed to read SRT file: {e}")
115
  return [current_count] + empty_fields + empty_vis + [gr.update()]
116
  parsed = parse_srt_content(content)
117
  if not parsed:
118
+ print(" No valid subtitles were parsed. Please check the SRT format.")
119
  return [current_count] + empty_fields + empty_vis + [gr.update()]
120
  updates = []
121
  for i in range(MAX_SEGMENTS):
 
134
  vis = [gr.update(visible=(i < new_count)) for i in range(MAX_SEGMENTS)]
135
  btn = gr.update(interactive=(new_count < MAX_SEGMENTS))
136
  if len(parsed) > MAX_SEGMENTS:
137
+ gr.Warning(f"The SRT contains {len(parsed)} segments; only the first {MAX_SEGMENTS} were kept.")
138
 
139
  return [new_count] + updates + vis + [btn]
140
 
141
  def process_dubbing(video_file, *segment_inputs, progress=gr.Progress()):
142
  """主推理流程"""
143
  if not video_file:
144
+ return None, "❌ Please upload the video file."
145
 
146
  video_duration = get_video_duration(video_file)
147
  if video_duration <= 0:
148
+ return None, "❌ Unable to obtain video duration, please check the video file."
149
 
150
  if os.path.exists(TEMP_DIR):
151
  try:
152
  shutil.rmtree(TEMP_DIR)
153
  except Exception as e:
154
+ return None, f"❌ Failed to clear temporary directory: {e}"
155
  os.makedirs(TEMP_DIR, exist_ok=True)
156
 
157
  # 解析 segment_inputs
 
172
 
173
  errors = validate_timestamps(start, end, video_duration)
174
  if errors:
175
+ return None, f"❌ Clip {i+1} timestamp error:\n" + "\n".join(errors)
176
 
177
  data = {
178
  "text": str(text).strip(),
179
  "clue": str(clue) if clue else "",
180
  "start": float(start) if start else 0.0,
181
  "end": float(end) if end else 0.0,
182
+ "age": str(age) if age else "unknown",
183
+ "gender": str(gender) if gender else "unknown",
184
  "ref_audio": str(ref_audio) if ref_audio else ""
185
  }
186
 
187
  segments_data.append(data)
188
 
189
  if not segments_data:
190
+ return None, "❌ The valid clip data is empty. Please enable and fill in at least one clip."
191
 
192
  try:
193
+ progress(0.1, desc="📋 Preprocess the video to generate JSONL data...")
194
  frontend = init_frontend_models()
195
  jsonl_path, jsonl_items = generate_jsonl_data(frontend, video_file, segments_data, TEMP_DIR, video_duration)
196
+ report_lines = [f"✅ Task completed! A total of **{len(jsonl_items)}** data fragments were generated.\n", "**Detailed JSONL data preview:**", "=" * 40]
197
  for idx, item in enumerate(jsonl_items):
198
+ report_lines.extend([f"\n---Clip #{idx + 1} ---", json.dumps(item, ensure_ascii=False, indent=2), "-" * 40])
199
  full_report = "\n".join(report_lines)
200
 
201
+ progress(0.3, desc="🔄 FunCineForge dubbing model loading...")
202
 
203
  eng = init_engine()
204
  if eng and jsonl_items:
205
  try:
206
+ progress(0.5, desc="🚀 FunCineForge dubbing model inference...")
207
  eng.inference(jsonl_path)
208
 
209
+ progress(0.8, desc="🎵 Pasting the voiceover back into the muted video...")
210
 
211
  output_wav_dir = os.path.join(TEMP_DIR, "wav")
212
  final_video_path = os.path.join(TEMP_DIR, "dubbed_video.mp4")
213
 
214
  if not os.path.exists(output_wav_dir):
215
+ return None, f"⚠️ Audio output directory not found: {output_wav_dir}"
216
 
217
  wav_files = sorted([f for f in os.listdir(output_wav_dir) if f.endswith('.wav')])
218
  if not wav_files:
219
+ return None, f"⚠️ No audio files were generated in: {output_wav_dir}"
220
 
221
  time_mapping = {}
222
  for item in jsonl_items:
 
253
  if 'final_audio' in locals(): final_audio.close()
254
  final_clip.close()
255
 
256
+ progress(1.0, desc="✅ Dubbing complete")
257
  return final_video_path, full_report
258
  except Exception as e:
259
  import traceback; traceback.print_exc()
260
  if "index out of range" in str(e):
261
+ return None, f"⚠️ Model inference failed. Error: {str(e)}. It is recommended to complete the input clue description and speaker attributes."
262
  else:
263
+ return None, f"⚠️ Model inference failed. Error: {str(e)}"
264
  else:
265
  time.sleep(1)
266
+ progress(1.0, desc="Simulation complete")
267
  return video_file, full_report
268
 
269
  except Exception as e:
270
  import traceback; traceback.print_exc()
271
+ return None, f"❌ Error: {str(e)}"
272
 
273
 
274
  # ==================== 主程序 ====================
 
276
  def main():
277
  os.makedirs(TEMP_DIR, exist_ok=True)
278
  with gr.Blocks(
279
+ title="Fun-CineForge-Demo",
280
  theme=gr.themes.Soft(),
281
  css="""
282
  .segment-accordion { margin: 10px 0; }
 
288
  gr.Markdown("""
289
  # 🎬 Fun-CineForge
290
 
291
+ **Workflow:** Upload short video → Add clip information (or upload an .srt subtitle file) → Upload reference audio (optional) → Preprocessing, model loading, and inference → Output dubbed video
292
  """)
293
 
294
  with gr.Row():
295
  with gr.Column(scale=1):
296
+ video_input = gr.Video(label="Upload video", sources=["upload"])
297
+ load_video_btn = gr.Button("📂 Load sample video", variant="secondary", size="sm")
298
+ srt_input = gr.UploadButton("Upload SRT subtitles", file_types=[".srt"], size="sm", variant="secondary")
299
  # with gr.Row(elem_classes=["srt-compact"]):
300
  # srt_input = gr.File(label="上传 SRT 字幕", file_types=[".srt"], height="auto")
301
+ gr.Markdown("### 🎛️ Dubbing clip configuration")
302
 
303
  segments, accordions = create_segments_ui()
304
  seg_count_state = gr.State(1) #🔑记录当前可见片段数
305
+ add_segment_btn = gr.Button("➕Add new clip", size="sm", variant="secondary")
306
+ submit_btn = gr.Button("🚀 Start dubbing", variant="stop", size="lg")
307
 
308
  with gr.Column(scale=1):
309
+ video_output = gr.Video(label="📺 Dubbed video", autoplay=True)
310
 
311
+ status_text = gr.Textbox(label="Result status", interactive=False, lines=2)
312
 
313
  gr.Markdown("""
314
+ ### 📝 Instructions for use
315
+ | Fields | Descriptions |
316
  |------|------|
317
+ | Dubbing script | The content of this clip (supports Chinese/English) |
318
+ | Clue description | Please refer to the sample format to explain the dubbing requirements, focusing on describing the speaker's gender, age, tone, and emotion |
319
+ | Timestamps | Start and end timestamps (accurate to milliseconds). The model is sensitive to timestamps; it is recommended to use timestamps adjacent to the audio clip. Duration ≤30s/clip |
320
+ | Age/Gender | Speaker attribute options |
321
+ | Reference audio | Voice cloning reference (Optional) |
322
+
323
+ **⚠️ Note:** Ensure that the timestamps of each clip don't overlap and don't exceed the video duration. The model will perform time alignment based on the timestamps, with weak supervision aligning lip movements.
324
  """)
325
 
326
  # ==================== 事件绑定 ====================