# Ai_lip_sync / app.py
# Source: Hugging Face Space by Actual-Innocence (commit c6e4b29, "Update app.py").
# NOTE(review): these lines were scraped web-page residue, not Python; kept as comments.
import os
import re
import tempfile
import subprocess
import gradio as gr
from pydub import AudioSegment
import soundfile as sf
# FIX: the original `import NeuTTS_Air_q4_GGUF` is not an importable module name and
# left `NeuTTSAir` (used below) undefined. The class lives in the neutts-air package.
from neuttsair.neutts import NeuTTSAir

# Load NeuTTS-Air (Q4-GGUF) once at module import so every request reuses it.
# CPU devices keep this runnable on free Spaces hardware.
print("🧠 Loading NeuTTS-Air (Q4-GGUF)...")
tts = NeuTTSAir(
    backbone_repo="neuphonic/neutts-air-q4-gguf",  # quantized GGUF backbone (Q4, not Q8)
    backbone_device="cpu",
    codec_repo="neuphonic/neucodec",               # neural audio codec for reference encoding
    codec_device="cpu",
)
def parse_host_script(script):
    """Split a podcast script into ordered (speaker, text) segments.

    Lines prefixed with "HOST 1:" or "HOST 2:" are attributed to that host
    with the prefix removed; any other non-empty line is attributed to
    HOST 1 verbatim. Blank lines are skipped.
    """
    segments = []
    for raw_line in script.strip().split("\n"):
        text = raw_line.strip()
        if not text:
            continue
        for tag in ("HOST 1", "HOST 2"):
            prefix = tag + ":"
            if text.startswith(prefix):
                segments.append((tag, text[len(prefix):].strip()))
                break
        else:
            # No recognized speaker prefix: default the line to HOST 1.
            segments.append(("HOST 1", text))
    return segments
def generate_audio_from_script(script, ref1_wav, ref1_txt, ref2_wav, ref2_txt):
    """Synthesize the whole script as one WAV via NeuTTS-Air voice cloning.

    Args:
        script: Multi-line text using "HOST 1:" / "HOST 2:" prefixes.
        ref1_wav, ref1_txt: HOST 1 reference audio path and its transcript.
        ref2_wav, ref2_txt: HOST 2 reference audio path and its transcript.

    Returns:
        Path to the concatenated WAV file for the full script.
    """
    segments = parse_host_script(script)
    ref_map = {
        "HOST 1": (ref1_wav, ref1_txt),
        "HOST 2": (ref2_wav, ref2_txt),
    }
    # FIX: unique work dir instead of fixed /tmp names, so concurrent
    # requests don't overwrite each other's segment/output files.
    work_dir = tempfile.mkdtemp(prefix="tts_segments_")
    # FIX: encode each host's reference at most once, not once per segment.
    codes_cache = {}
    output_files = []
    for i, (tag, text) in enumerate(segments):
        host = tag if tag in ref_map else "HOST 1"
        ref_wav, ref_text = ref_map[host]
        if host not in codes_cache:
            codes_cache[host] = tts.encode_reference(ref_wav)
        wav = tts.infer(text, codes_cache[host], ref_text)
        out_path = os.path.join(work_dir, f"seg_{i}.wav")
        sf.write(out_path, wav, 24000)  # NeuTTS-Air outputs 24 kHz audio
        output_files.append(out_path)
    # Concatenate all per-segment WAVs in script order.
    combined = AudioSegment.empty()
    for f in output_files:
        combined += AudioSegment.from_wav(f)
    final_path = os.path.join(work_dir, "script_audio.wav")
    combined.export(final_path, format="wav")
    return final_path
def generate_video_with_lipsync(audio_path, ref_video_path):
    """Produce a lip-synced video for `audio_path` driven by `ref_video_path`.

    Placeholder implementation: the real lip-sync model (e.g. SadTalker,
    Wav2Lip) is not wired in yet, so the reference video path is returned
    unchanged.

    Args:
        audio_path: Path to the generated speech WAV.
        ref_video_path: Path to the reference face video.

    Returns:
        Path to the output video (currently just `ref_video_path`).
    """
    # Intended output location once a real pipeline is plugged in.
    output_video = f"/tmp/output_{os.path.basename(audio_path)}.mp4"
    # TODO: invoke the lip-sync model here, e.g.:
    # subprocess.run([...])
    # FIX: the original returned the undefined name `ref_video`, which raised
    # NameError on every call; the parameter is `ref_video_path`.
    return ref_video_path
def generate_video_podcast(script, ref1_video, ref1_transcript, ref2_video=None, ref2_transcript=None):
    """End-to-end pipeline: script -> cloned speech audio -> lip-synced video.

    Args:
        script: Multi-line script with "HOST 1:" / "HOST 2:" prefixes.
        ref1_video, ref1_transcript: HOST 1 reference video and transcript.
        ref2_video, ref2_transcript: Optional HOST 2 reference; falls back
            to HOST 1's reference when not provided.

    Returns:
        (video_path, audio_path, status_message) tuple for the Gradio outputs.
    """
    # FIX: with the old code, omitting HOST 2 made ref_map hold (None, None),
    # crashing the TTS stage on any "HOST 2:" line. Reuse HOST 1's reference.
    if ref2_video is None:
        ref2_video, ref2_transcript = ref1_video, ref1_transcript
    # NOTE(review): a video file path is passed where the TTS expects reference
    # audio — presumably the audio track should be extracted first; verify.
    audio_path = generate_audio_from_script(script, ref1_video, ref1_transcript, ref2_video, ref2_transcript)
    # Drive the lip-sync stage with HOST 1's face video.
    video_path = generate_video_with_lipsync(audio_path, ref1_video)
    return video_path, audio_path, "✅ Video podcast generated!"
# ---- Gradio interface ----
with gr.Blocks(title="2nd-Host AI - Video Podcast Generator") as demo:
    gr.Markdown("# 🎥 2nd-Host AI — Video Podcast Generator")
    gr.Markdown("Upload reference videos + transcripts. Enter script. Get video with lip-sync.")

    # Side-by-side reference inputs for each host.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### HOST 1 Reference")
            host1_video = gr.Video(label="Reference Video (15s, face)", sources=["upload"])
            host1_transcript = gr.Textbox(label="Transcript", placeholder="What they said in video")
        with gr.Column():
            gr.Markdown("### HOST 2 Reference (Optional)")
            host2_video = gr.Video(label="Reference Video (15s, face)", sources=["upload"])
            host2_transcript = gr.Textbox(label="Transcript", placeholder="What they said in video")

    # Script input and trigger.
    script_input = gr.Textbox(label="Script (HOST 1: / HOST 2:)", lines=8)
    generate_btn = gr.Button("Generate Video Podcast")

    # Pipeline outputs.
    video_output = gr.Video(label="Generated Video")
    audio_output = gr.Audio(label="Generated Audio")
    status_output = gr.Textbox(label="Status")

    generate_btn.click(
        generate_video_podcast,
        inputs=[script_input, host1_video, host1_transcript, host2_video, host2_transcript],
        outputs=[video_output, audio_output, status_output],
    )

demo.launch()