Spaces:
Running on Zero
Running on Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -69,22 +69,22 @@ def create_segments_ui():
|
|
| 69 |
segments = []
|
| 70 |
accordions = []
|
| 71 |
for i in range(MAX_SEGMENTS):
|
| 72 |
-
with gr.Accordion(f"🎬
|
| 73 |
accordions.append(acc)
|
| 74 |
with gr.Row():
|
| 75 |
-
text_input = gr.Textbox(label="📝
|
| 76 |
-
clue_input = gr.Textbox(label="💡
|
| 77 |
with gr.Row():
|
| 78 |
-
start_time = gr.Number(label="⏱️
|
| 79 |
-
end_time = gr.Number(label="⏱️
|
| 80 |
with gr.Row():
|
| 81 |
-
age_input = gr.Dropdown(label="👤
|
| 82 |
-
gender_input = gr.Dropdown(label="👤
|
| 83 |
with gr.Row():
|
| 84 |
-
ref_audio = gr.Audio(label="🎤
|
| 85 |
-
load_audio_btn = gr.Button("📂
|
| 86 |
with gr.Row():
|
| 87 |
-
enable_check = gr.Checkbox(label="
|
| 88 |
|
| 89 |
segments.append({
|
| 90 |
"accordion": acc, "text": text_input, "clue": clue_input, "start": start_time, "end": end_time,
|
|
@@ -95,11 +95,11 @@ def create_segments_ui():
|
|
| 95 |
def add_segment_fn(current_count):
|
| 96 |
"""点击加号:显示下一个片段,到达上限则禁用按钮"""
|
| 97 |
if current_count >= MAX_SEGMENTS:
|
| 98 |
-
return [current_count] + [gr.update() for _ in range(MAX_SEGMENTS)] + [gr.update(interactive=False, value=f"
|
| 99 |
|
| 100 |
new_count = current_count + 1
|
| 101 |
vis = [gr.update(visible=(i < new_count)) for i in range(MAX_SEGMENTS)]
|
| 102 |
-
btn = gr.update(interactive=(new_count < MAX_SEGMENTS), value="➕
|
| 103 |
return [new_count] + vis + [btn]
|
| 104 |
|
| 105 |
def load_srt_fn(srt_file, current_count):
|
|
@@ -111,11 +111,11 @@ def load_srt_fn(srt_file, current_count):
|
|
| 111 |
with open(srt_file, 'r', encoding='utf-8-sig') as f:
|
| 112 |
content = f.read()
|
| 113 |
except Exception as e:
|
| 114 |
-
gr.Warning(f"
|
| 115 |
return [current_count] + empty_fields + empty_vis + [gr.update()]
|
| 116 |
parsed = parse_srt_content(content)
|
| 117 |
if not parsed:
|
| 118 |
-
print("
|
| 119 |
return [current_count] + empty_fields + empty_vis + [gr.update()]
|
| 120 |
updates = []
|
| 121 |
for i in range(MAX_SEGMENTS):
|
|
@@ -134,24 +134,24 @@ def load_srt_fn(srt_file, current_count):
|
|
| 134 |
vis = [gr.update(visible=(i < new_count)) for i in range(MAX_SEGMENTS)]
|
| 135 |
btn = gr.update(interactive=(new_count < MAX_SEGMENTS))
|
| 136 |
if len(parsed) > MAX_SEGMENTS:
|
| 137 |
-
gr.Warning(f"SRT
|
| 138 |
|
| 139 |
return [new_count] + updates + vis + [btn]
|
| 140 |
|
| 141 |
def process_dubbing(video_file, *segment_inputs, progress=gr.Progress()):
|
| 142 |
"""主推理流程"""
|
| 143 |
if not video_file:
|
| 144 |
-
return None, "❌
|
| 145 |
|
| 146 |
video_duration = get_video_duration(video_file)
|
| 147 |
if video_duration <= 0:
|
| 148 |
-
return None, "❌
|
| 149 |
|
| 150 |
if os.path.exists(TEMP_DIR):
|
| 151 |
try:
|
| 152 |
shutil.rmtree(TEMP_DIR)
|
| 153 |
except Exception as e:
|
| 154 |
-
return None, f"❌
|
| 155 |
os.makedirs(TEMP_DIR, exist_ok=True)
|
| 156 |
|
| 157 |
# 解析 segment_inputs
|
|
@@ -172,51 +172,51 @@ def process_dubbing(video_file, *segment_inputs, progress=gr.Progress()):
|
|
| 172 |
|
| 173 |
errors = validate_timestamps(start, end, video_duration)
|
| 174 |
if errors:
|
| 175 |
-
return None, f"❌
|
| 176 |
|
| 177 |
data = {
|
| 178 |
"text": str(text).strip(),
|
| 179 |
"clue": str(clue) if clue else "",
|
| 180 |
"start": float(start) if start else 0.0,
|
| 181 |
"end": float(end) if end else 0.0,
|
| 182 |
-
"age": str(age) if age else "
|
| 183 |
-
"gender": str(gender) if gender else "
|
| 184 |
"ref_audio": str(ref_audio) if ref_audio else ""
|
| 185 |
}
|
| 186 |
|
| 187 |
segments_data.append(data)
|
| 188 |
|
| 189 |
if not segments_data:
|
| 190 |
-
return None, "❌
|
| 191 |
|
| 192 |
try:
|
| 193 |
-
progress(0.1, desc="📋
|
| 194 |
frontend = init_frontend_models()
|
| 195 |
jsonl_path, jsonl_items = generate_jsonl_data(frontend, video_file, segments_data, TEMP_DIR, video_duration)
|
| 196 |
-
report_lines = [f"✅
|
| 197 |
for idx, item in enumerate(jsonl_items):
|
| 198 |
-
report_lines.extend([f"\n---
|
| 199 |
full_report = "\n".join(report_lines)
|
| 200 |
|
| 201 |
-
progress(0.3, desc="🔄 FunCineForge
|
| 202 |
|
| 203 |
eng = init_engine()
|
| 204 |
if eng and jsonl_items:
|
| 205 |
try:
|
| 206 |
-
progress(0.5, desc="🚀 FunCineForge
|
| 207 |
eng.inference(jsonl_path)
|
| 208 |
|
| 209 |
-
progress(0.8, desc="🎵
|
| 210 |
|
| 211 |
output_wav_dir = os.path.join(TEMP_DIR, "wav")
|
| 212 |
final_video_path = os.path.join(TEMP_DIR, "dubbed_video.mp4")
|
| 213 |
|
| 214 |
if not os.path.exists(output_wav_dir):
|
| 215 |
-
return None, f"⚠️
|
| 216 |
|
| 217 |
wav_files = sorted([f for f in os.listdir(output_wav_dir) if f.endswith('.wav')])
|
| 218 |
if not wav_files:
|
| 219 |
-
return None, f"⚠️
|
| 220 |
|
| 221 |
time_mapping = {}
|
| 222 |
for item in jsonl_items:
|
|
@@ -253,22 +253,22 @@ def process_dubbing(video_file, *segment_inputs, progress=gr.Progress()):
|
|
| 253 |
if 'final_audio' in locals(): final_audio.close()
|
| 254 |
final_clip.close()
|
| 255 |
|
| 256 |
-
progress(1.0, desc="✅
|
| 257 |
return final_video_path, full_report
|
| 258 |
except Exception as e:
|
| 259 |
import traceback; traceback.print_exc()
|
| 260 |
if "index out of range" in str(e):
|
| 261 |
-
return None, f"⚠️
|
| 262 |
else:
|
| 263 |
-
return None, f"⚠️
|
| 264 |
else:
|
| 265 |
time.sleep(1)
|
| 266 |
-
progress(1.0, desc="
|
| 267 |
return video_file, full_report
|
| 268 |
|
| 269 |
except Exception as e:
|
| 270 |
import traceback; traceback.print_exc()
|
| 271 |
-
return None, f"❌
|
| 272 |
|
| 273 |
|
| 274 |
# ==================== 主程序 ====================
|
|
@@ -276,7 +276,7 @@ def process_dubbing(video_file, *segment_inputs, progress=gr.Progress()):
|
|
| 276 |
def main():
|
| 277 |
os.makedirs(TEMP_DIR, exist_ok=True)
|
| 278 |
with gr.Blocks(
|
| 279 |
-
title="Fun-CineForge
|
| 280 |
theme=gr.themes.Soft(),
|
| 281 |
css="""
|
| 282 |
.segment-accordion { margin: 10px 0; }
|
|
@@ -288,39 +288,39 @@ def main():
|
|
| 288 |
gr.Markdown("""
|
| 289 |
# 🎬 Fun-CineForge
|
| 290 |
|
| 291 |
-
**
|
| 292 |
""")
|
| 293 |
|
| 294 |
with gr.Row():
|
| 295 |
with gr.Column(scale=1):
|
| 296 |
-
video_input = gr.Video(label="
|
| 297 |
-
load_video_btn = gr.Button("📂
|
| 298 |
-
srt_input = gr.UploadButton("
|
| 299 |
# with gr.Row(elem_classes=["srt-compact"]):
|
| 300 |
# srt_input = gr.File(label="上传 SRT 字幕", file_types=[".srt"], height="auto")
|
| 301 |
-
gr.Markdown("### 🎛️
|
| 302 |
|
| 303 |
segments, accordions = create_segments_ui()
|
| 304 |
seg_count_state = gr.State(1) #🔑记录当前可见片段数
|
| 305 |
-
add_segment_btn = gr.Button("➕
|
| 306 |
-
submit_btn = gr.Button("🚀
|
| 307 |
|
| 308 |
with gr.Column(scale=1):
|
| 309 |
-
video_output = gr.Video(label="📺
|
| 310 |
|
| 311 |
-
status_text = gr.Textbox(label="
|
| 312 |
|
| 313 |
gr.Markdown("""
|
| 314 |
-
### 📝
|
| 315 |
-
|
|
| 316 |
|------|------|
|
| 317 |
-
|
|
| 318 |
-
|
|
| 319 |
-
|
|
| 320 |
-
|
|
| 321 |
-
|
|
| 322 |
-
|
| 323 |
-
**⚠️
|
| 324 |
""")
|
| 325 |
|
| 326 |
# ==================== 事件绑定 ====================
|
|
|
|
| 69 |
segments = []
|
| 70 |
accordions = []
|
| 71 |
for i in range(MAX_SEGMENTS):
|
| 72 |
+
with gr.Accordion(f"🎬 Dubbing clip {i + 1}", open=(i == 0), visible=(i == 0)) as acc:
|
| 73 |
accordions.append(acc)
|
| 74 |
with gr.Row():
|
| 75 |
+
text_input = gr.Textbox(label="📝 Dubbing script", placeholder="Please enter the script...", lines=2, scale=3, elem_id=f"text_{i}")
|
| 76 |
+
clue_input = gr.Textbox(label="💡 Clue description", placeholder="一位中年男性角色语气沉稳且坚定,流露出对自身忠诚的强烈自信与决心。整体情感是忠贞不渝的承诺和不容置疑的信念。", lines=2, scale=3, elem_id=f"clue_{i}")
|
| 77 |
with gr.Row():
|
| 78 |
+
start_time = gr.Number(label="⏱️ Start timestamp (s)", value=0.0 + i*5, precision=2, scale=2, elem_id=f"start_{i}")
|
| 79 |
+
end_time = gr.Number(label="⏱️ End timestamp (s)", value=5.0 + i*5, precision=2, scale=2, elem_id=f"end_{i}")
|
| 80 |
with gr.Row():
|
| 81 |
+
age_input = gr.Dropdown(label="👤 Age", choices=["child", "teenager", "adult", "middle-aged", "elderly", "unknown"], value="unknown", scale=2, elem_id=f"age_{i}")
|
| 82 |
+
gender_input = gr.Dropdown(label="👤 Gender", choices=["male", "female", "unknown"], value="unknown", scale=2, elem_id=f"gender_{i}")
|
| 83 |
with gr.Row():
|
| 84 |
+
ref_audio = gr.Audio(label="🎤 Reference audio (optional, the video's audio is used as the reference audio by default).", sources=["upload"], type="filepath", scale=4,elem_id=f"audio_{i}")
|
| 85 |
+
load_audio_btn = gr.Button("📂 Load sample audio", size="sm", variant="secondary", scale=1) if i == 0 else None
|
| 86 |
with gr.Row():
|
| 87 |
+
enable_check = gr.Checkbox(label="Enable this clip", value=(i == 0), scale=1, elem_id=f"enable_{i}")
|
| 88 |
|
| 89 |
segments.append({
|
| 90 |
"accordion": acc, "text": text_input, "clue": clue_input, "start": start_time, "end": end_time,
|
|
|
|
| 95 |
def add_segment_fn(current_count):
    """Handle a click on the add button: reveal the next clip, or lock the button at the limit.

    Args:
        current_count: Number of clips currently shown.

    Returns:
        A flat list made of the (possibly incremented) clip count, then
        ``MAX_SEGMENTS`` ``gr.update`` objects (presumably one per clip
        accordion; confirm against the event binding), then one
        ``gr.update`` for the add button.
    """
    if current_count >= MAX_SEGMENTS:
        # Already at the cap: keep the count, send no-op updates for every
        # slot, and only disable and relabel the button.
        outputs = [current_count]
        outputs.extend(gr.update() for _ in range(MAX_SEGMENTS))
        outputs.append(
            gr.update(interactive=False, value=f"The limit has been reached. ({MAX_SEGMENTS})")
        )
        return outputs

    shown = current_count + 1
    outputs = [shown]
    # Slots with an index below the new count are visible; the rest stay hidden.
    for slot in range(MAX_SEGMENTS):
        outputs.append(gr.update(visible=(slot < shown)))
    # The button stays clickable only while there is still room for another clip.
    outputs.append(gr.update(interactive=(shown < MAX_SEGMENTS), value="➕ New clip"))
    return outputs
|
| 104 |
|
| 105 |
def load_srt_fn(srt_file, current_count):
|
|
|
|
| 111 |
with open(srt_file, 'r', encoding='utf-8-sig') as f:
|
| 112 |
content = f.read()
|
| 113 |
except Exception as e:
|
| 114 |
+
gr.Warning(f"Failed to read SRT file: {e}")
|
| 115 |
return [current_count] + empty_fields + empty_vis + [gr.update()]
|
| 116 |
parsed = parse_srt_content(content)
|
| 117 |
if not parsed:
|
| 118 |
+
print(" No valid subtitles were parsed. Please check the SRT format.")
|
| 119 |
return [current_count] + empty_fields + empty_vis + [gr.update()]
|
| 120 |
updates = []
|
| 121 |
for i in range(MAX_SEGMENTS):
|
|
|
|
| 134 |
vis = [gr.update(visible=(i < new_count)) for i in range(MAX_SEGMENTS)]
|
| 135 |
btn = gr.update(interactive=(new_count < MAX_SEGMENTS))
|
| 136 |
if len(parsed) > MAX_SEGMENTS:
|
| 137 |
+
gr.Warning(f"The SRT contains {len(parsed)} fragments, of which the first {MAX_SEGMENTS} have been truncated.")
|
| 138 |
|
| 139 |
return [new_count] + updates + vis + [btn]
|
| 140 |
|
| 141 |
def process_dubbing(video_file, *segment_inputs, progress=gr.Progress()):
|
| 142 |
"""主推理流程"""
|
| 143 |
if not video_file:
|
| 144 |
+
return None, "❌ Please upload the video file."
|
| 145 |
|
| 146 |
video_duration = get_video_duration(video_file)
|
| 147 |
if video_duration <= 0:
|
| 148 |
+
return None, "❌ Unable to obtain video duration, please check the video file."
|
| 149 |
|
| 150 |
if os.path.exists(TEMP_DIR):
|
| 151 |
try:
|
| 152 |
shutil.rmtree(TEMP_DIR)
|
| 153 |
except Exception as e:
|
| 154 |
+
return None, f"❌ Failed to clear temporary directory:{e}"
|
| 155 |
os.makedirs(TEMP_DIR, exist_ok=True)
|
| 156 |
|
| 157 |
# 解析 segment_inputs
|
|
|
|
| 172 |
|
| 173 |
errors = validate_timestamps(start, end, video_duration)
|
| 174 |
if errors:
|
| 175 |
+
return None, f"❌ Clip {i+1} timestamp error:\n" + "\n".join(errors)
|
| 176 |
|
| 177 |
data = {
|
| 178 |
"text": str(text).strip(),
|
| 179 |
"clue": str(clue) if clue else "",
|
| 180 |
"start": float(start) if start else 0.0,
|
| 181 |
"end": float(end) if end else 0.0,
|
| 182 |
+
"age": str(age) if age else "unknown",
|
| 183 |
+
"gender": str(gender) if gender else "unknown",
|
| 184 |
"ref_audio": str(ref_audio) if ref_audio else ""
|
| 185 |
}
|
| 186 |
|
| 187 |
segments_data.append(data)
|
| 188 |
|
| 189 |
if not segments_data:
|
| 190 |
+
return None, "❌ The valid clip data is empty. Please enable and fill in at least one clip."
|
| 191 |
|
| 192 |
try:
|
| 193 |
+
progress(0.1, desc="📋 Preprocess the video to generate JSONL data...")
|
| 194 |
frontend = init_frontend_models()
|
| 195 |
jsonl_path, jsonl_items = generate_jsonl_data(frontend, video_file, segments_data, TEMP_DIR, video_duration)
|
| 196 |
+
report_lines = [f"✅ Task completed! A total of **{len(jsonl_items)}** data fragments were generated.\n", "Detailed JSONL data preview:**", "=" * 40]
|
| 197 |
for idx, item in enumerate(jsonl_items):
|
| 198 |
+
report_lines.extend([f"\n---Clip #{idx + 1} ---", json.dumps(item, ensure_ascii=False, indent=2), "-" * 40])
|
| 199 |
full_report = "\n".join(report_lines)
|
| 200 |
|
| 201 |
+
progress(0.3, desc="🔄 FunCineForge dubbing model loading...")
|
| 202 |
|
| 203 |
eng = init_engine()
|
| 204 |
if eng and jsonl_items:
|
| 205 |
try:
|
| 206 |
+
progress(0.5, desc="🚀 FunCineForge dubbing model inference...")
|
| 207 |
eng.inference(jsonl_path)
|
| 208 |
|
| 209 |
+
progress(0.8, desc="🎵 Pasting the voiceover back into the muted video...")
|
| 210 |
|
| 211 |
output_wav_dir = os.path.join(TEMP_DIR, "wav")
|
| 212 |
final_video_path = os.path.join(TEMP_DIR, "dubbed_video.mp4")
|
| 213 |
|
| 214 |
if not os.path.exists(output_wav_dir):
|
| 215 |
+
return None, f"⚠️ Audio output directory not found:{output_wav_dir}"
|
| 216 |
|
| 217 |
wav_files = sorted([f for f in os.listdir(output_wav_dir) if f.endswith('.wav')])
|
| 218 |
if not wav_files:
|
| 219 |
+
return None, f"⚠️ No audio files were generated:{output_wav_dir}"
|
| 220 |
|
| 221 |
time_mapping = {}
|
| 222 |
for item in jsonl_items:
|
|
|
|
| 253 |
if 'final_audio' in locals(): final_audio.close()
|
| 254 |
final_clip.close()
|
| 255 |
|
| 256 |
+
progress(1.0, desc="✅ Dubbing complete")
|
| 257 |
return final_video_path, full_report
|
| 258 |
except Exception as e:
|
| 259 |
import traceback; traceback.print_exc()
|
| 260 |
if "index out of range" in str(e):
|
| 261 |
+
return None, f"⚠️ Model inference failed. Error: {str(e)}. It is recommended to complete the input clue description and speaker attributes."
|
| 262 |
else:
|
| 263 |
+
return None, f"⚠️ Model inference failed. Error: {str(e)}"
|
| 264 |
else:
|
| 265 |
time.sleep(1)
|
| 266 |
+
progress(1.0, desc="Simulation complete")
|
| 267 |
return video_file, full_report
|
| 268 |
|
| 269 |
except Exception as e:
|
| 270 |
import traceback; traceback.print_exc()
|
| 271 |
+
return None, f"❌ Error: {str(e)}"
|
| 272 |
|
| 273 |
|
| 274 |
# ==================== 主程序 ====================
|
|
|
|
| 276 |
def main():
|
| 277 |
os.makedirs(TEMP_DIR, exist_ok=True)
|
| 278 |
with gr.Blocks(
|
| 279 |
+
title="Fun-CineForge-Demo",
|
| 280 |
theme=gr.themes.Soft(),
|
| 281 |
css="""
|
| 282 |
.segment-accordion { margin: 10px 0; }
|
|
|
|
| 288 |
gr.Markdown("""
|
| 289 |
# 🎬 Fun-CineForge
|
| 290 |
|
| 291 |
+
**Workflow:** Upload short video → Add clip information (or upload .srt subtitle file) → Upload reference audio (optional) → Preprocessing, model loading, and inference → Output dubbed video
|
| 292 |
""")
|
| 293 |
|
| 294 |
with gr.Row():
|
| 295 |
with gr.Column(scale=1):
|
| 296 |
+
video_input = gr.Video(label="Upload video", sources=["upload"])
|
| 297 |
+
load_video_btn = gr.Button("📂 Load sample video", variant="secondary", size="sm")
|
| 298 |
+
srt_input = gr.UploadButton("Upload SRT subtitles", file_types=[".srt"], size="sm", variant="secondary")
|
| 299 |
# with gr.Row(elem_classes=["srt-compact"]):
|
| 300 |
# srt_input = gr.File(label="上传 SRT 字幕", file_types=[".srt"], height="auto")
|
| 301 |
+
gr.Markdown("### 🎛️ Dubbing clip configuration")
|
| 302 |
|
| 303 |
segments, accordions = create_segments_ui()
|
| 304 |
seg_count_state = gr.State(1) #🔑记录当前可见片段数
|
| 305 |
+
add_segment_btn = gr.Button("➕Add new clip", size="sm", variant="secondary")
|
| 306 |
+
submit_btn = gr.Button("🚀 Start dubbing", variant="stop", size="lg")
|
| 307 |
|
| 308 |
with gr.Column(scale=1):
|
| 309 |
+
video_output = gr.Video(label="📺 Dubbed video", autoplay=True)
|
| 310 |
|
| 311 |
+
status_text = gr.Textbox(label="Result status", interactive=False, lines=2)
|
| 312 |
|
| 313 |
gr.Markdown("""
|
| 314 |
+
### 📝 Instructions for use
|
| 315 |
+
| Fields | Descriptions |
|
| 316 |
|------|------|
|
| 317 |
+
| Dubbing script | The content of this clip (supports Chinese/English) |
|
| 318 |
+
| Clue description | Please refer to the sample format to explain the dubbing requirements, focusing on describing the speaker's gender, age, tone, and emotion |
|
| 319 |
+
| Timestamps | Start and end timestamps (accurate to milliseconds). The model is sensitive to timestamps; it is recommended to use timestamps adjacent to the audio clip. Duration ≤ 30s/clip |
|
| 320 |
+
| Age/Gender | Speaker attribute options |
|
| 321 |
+
| Reference audio | Voice cloning reference (Optional) |
|
| 322 |
+
|
| 323 |
+
**⚠️ Note:** Ensure that the timestamps of each clip don't overlap and don't exceed the video duration. The model will perform time alignment based on the timestamps, with weak supervision aligning lip movements.
|
| 324 |
""")
|
| 325 |
|
| 326 |
# ==================== 事件绑定 ====================
|