"""2nd-Host AI — generate a two-host video podcast from a script.

Pipeline: parse a "HOST 1:/HOST 2:" script, synthesize each segment with
NeuTTS-Air voice cloning, concatenate the segments into one WAV, then
(placeholder) lip-sync the result onto the HOST 1 reference video.
Served through a Gradio UI.
"""

import os
import re  # noqa: F401  (kept from original; may be used elsewhere)
import subprocess  # noqa: F401  (reserved for the real lip-sync pipeline)
import tempfile

import gradio as gr
import soundfile as sf
from pydub import AudioSegment

# BUG FIX: the original `import NeuTTS_Air_q4_GGUF` never brought `NeuTTSAir`
# into scope, so the constructor call below raised NameError at startup.
# The class is published by Neuphonic in the `neuttsair` package —
# TODO confirm this import path matches the installed version.
from neuttsair.neutts import NeuTTSAir

# Load NeuTTS-Air (Q4-GGUF backbone — the original comment said Q8 by mistake).
print("🧠 Loading NeuTTS-Air (Q4-GGUF)...")
tts = NeuTTSAir(
    backbone_repo="neuphonic/neutts-air-q4-gguf",
    backbone_device="cpu",
    codec_repo="neuphonic/neucodec",
    codec_device="cpu",
)

SAMPLE_RATE = 24_000  # sample rate the original code wrote WAVs at


def parse_host_script(script):
    """Split *script* into (speaker_tag, text) pairs.

    Lines starting with "HOST 1:" / "HOST 2:" are attributed accordingly;
    any other non-empty line defaults to HOST 1. Blank lines are dropped.
    """
    segments = []
    for raw in script.strip().split("\n"):
        line = raw.strip()
        if not line:
            continue
        for tag in ("HOST 1", "HOST 2"):
            prefix = tag + ":"
            if line.startswith(prefix):
                segments.append((tag, line[len(prefix):].strip()))
                break
        else:
            # No recognized speaker prefix — attribute to HOST 1.
            segments.append(("HOST 1", line))
    return segments


def generate_audio_from_script(script, ref1_wav, ref1_txt, ref2_wav, ref2_txt):
    """Synthesize every script segment and return the path of the joined WAV.

    Fixes vs. original:
    - HOST 2 is optional in the UI; if no HOST 2 reference was supplied,
      fall back to HOST 1's voice instead of calling encode_reference(None).
    - Each reference voice is encoded once and cached, not once per segment.
    - Output goes to a fresh temp directory instead of fixed /tmp paths,
      which collided across concurrent requests and broke on Windows.
    """
    segments = parse_host_script(script)
    ref_map = {"HOST 1": (ref1_wav, ref1_txt)}
    ref_map["HOST 2"] = (ref2_wav, ref2_txt) if ref2_wav else ref_map["HOST 1"]

    work_dir = tempfile.mkdtemp(prefix="podcast_")
    codes_cache = {}  # reference path -> encoded reference codes
    segment_paths = []
    for i, (tag, text) in enumerate(segments):
        ref_wav, ref_text = ref_map.get(tag, ref_map["HOST 1"])
        if ref_wav not in codes_cache:
            codes_cache[ref_wav] = tts.encode_reference(ref_wav)
        wav = tts.infer(text, codes_cache[ref_wav], ref_text)
        out_path = os.path.join(work_dir, f"seg_{i}.wav")
        sf.write(out_path, wav, SAMPLE_RATE)
        segment_paths.append(out_path)

    # Concatenate all per-segment WAVs into one file.
    combined = AudioSegment.empty()
    for path in segment_paths:
        combined += AudioSegment.from_wav(path)
    final_path = os.path.join(work_dir, "script_audio.wav")
    combined.export(final_path, format="wav")
    return final_path


def generate_video_with_lipsync(audio_path, ref_video_path):
    """Lip-sync *audio_path* onto *ref_video_path* (placeholder).

    TODO: plug in the real lip-sync pipeline (e.g. SadTalker / Wav2Lip via
    subprocess.run([...], shell=False)). Until then the reference video is
    returned unchanged.
    """
    # BUG FIX: the original returned the undefined name `ref_video`
    # (the parameter is `ref_video_path`), raising NameError on every call.
    return ref_video_path


def generate_video_podcast(script, ref1_video, ref1_transcript,
                           ref2_video=None, ref2_transcript=None):
    """Gradio callback: script + reference videos -> (video, audio, status).

    NOTE(review): the reference *video* file paths are forwarded straight to
    `tts.encode_reference`, which presumably expects an audio reference —
    confirm it can read the audio track of a video container, or extract the
    audio (e.g. with pydub/ffmpeg) before encoding.
    """
    audio_path = generate_audio_from_script(
        script, ref1_video, ref1_transcript, ref2_video, ref2_transcript
    )
    video_path = generate_video_with_lipsync(audio_path, ref1_video)
    return video_path, audio_path, "✅ Video podcast generated!"


# ---- Gradio UI -------------------------------------------------------------
with gr.Blocks(title="2nd-Host AI - Video Podcast Generator") as demo:
    gr.Markdown("# 🎥 2nd-Host AI — Video Podcast Generator")
    gr.Markdown("Upload reference videos + transcripts. Enter script. Get video with lip-sync.")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### HOST 1 Reference")
            ref1_video = gr.Video(label="Reference Video (15s, face)", sources=["upload"])
            ref1_txt = gr.Textbox(label="Transcript", placeholder="What they said in video")
        with gr.Column():
            gr.Markdown("### HOST 2 Reference (Optional)")
            ref2_video = gr.Video(label="Reference Video (15s, face)", sources=["upload"])
            ref2_txt = gr.Textbox(label="Transcript", placeholder="What they said in video")
    script = gr.Textbox(label="Script (HOST 1: / HOST 2:)", lines=8)
    btn = gr.Button("Generate Video Podcast")
    video_out = gr.Video(label="Generated Video")
    audio_out = gr.Audio(label="Generated Audio")
    status = gr.Textbox(label="Status")
    btn.click(
        generate_video_podcast,
        inputs=[script, ref1_video, ref1_txt, ref2_video, ref2_txt],
        outputs=[video_out, audio_out, status],
    )

# Guard the launch so importing this module (e.g. for testing) does not
# start the web server as a side effect.
if __name__ == "__main__":
    demo.launch()