#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Turn a PowerPoint deck with speaker notes into a narrated MP4 video.

Pipeline: the .pptx is converted to PDF with LibreOffice, each PDF page is
rendered to an image, the speaker notes of each slide are synthesized to
audio with MegaTTS3, and ffmpeg combines every image/audio pair into a clip
and finally concatenates the clips.
"""
import argparse
import functools
import os
import subprocess
import tempfile
from subprocess import call
from typing import Optional

from pdf2image import convert_from_path
from pptx import Presentation

from tts.infer_cli import MegaTTS3DiTInfer  # adjust import path as needed
from tts.utils.audio_utils.io import save_wav

__author__ = ['chaonan99']

## Sometimes ffmpeg is avconv
FFMPEG_NAME = 'ffmpeg'
# FFMPEG_NAME = 'avconv'

# Prompt recording handed to get_tts() when the caller does not supply one.
DEFAULT_PROMPT_WAV = "assets/English_prompt.wav"


@functools.lru_cache(maxsize=1)
def _load_tts_model(device: Optional[str] = None):
    """Return a MegaTTS3DiTInfer for *device*, reusing the last one built.

    get_tts() is called once per slide; caching avoids constructing a new
    inference object for every call.
    """
    return MegaTTS3DiTInfer(device=device)


def get_tts(
    input_wav_path: str,
    input_text: str,
    output_path: str,
    time_step: int = 32,
    p_w: float = 1.6,
    t_w: float = 2.5,
    device: Optional[str] = None,
) -> str:
    """
    Generate TTS audio from an input WAV file and text prompt.

    Args:
        input_wav_path: Path to the input WAV (prompt) file.
        input_text: Text to synthesize.
        output_path: Path to the output audio file.
        time_step: Diffusion inference steps.
        p_w: Intelligibility weight.
        t_w: Similarity weight.
        device: Device specifier (e.g., 'cuda' or 'cpu'). If None, auto-selected.

    Returns:
        ``output_path``, unchanged, after the audio has been handed to
        ``save_wav``.
    """
    # Reuse the inference object between calls (see _load_tts_model).
    infer = _load_tts_model(device)

    # Read prompt audio
    with open(input_wav_path, 'rb') as f:
        audio_bytes = f.read()

    # A '.npy' file next to the prompt WAV, if present, is passed along as
    # the latent file; otherwise latent_file stays None.
    latent_file = None
    potential_npy = os.path.splitext(input_wav_path)[0] + '.npy'
    if os.path.isfile(potential_npy):
        latent_file = potential_npy

    # Preprocess the prompt, then synthesize speech for the text
    resource_context = infer.preprocess(audio_bytes, latent_file=latent_file)
    wav_bytes = infer.forward(
        resource_context,
        input_text,
        time_step=time_step,
        p_w=p_w,
        t_w=t_w,
    )

    # Ensure output directory exists and save
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    save_wav(wav_bytes, output_path)

    return output_path


def _slide_notes(slide):
    """Return the speaker-notes text of *slide*, or '' when it has none."""
    if not slide.has_notes_slide:
        return ''
    frame = slide.notes_slide.notes_text_frame
    # notes_text_frame is None when the notes slide has no notes placeholder.
    return frame.text if frame is not None else ''


def ppt_presenter(pptx_path, prompt_wav_path=DEFAULT_PROMPT_WAV):
    """Render *pptx_path* into a narrated video stored next to it.

    The PDF and the resulting MP4 are written beside the .pptx, using the
    same base name. Slides whose notes are missing or blank are left out of
    the video, because there is nothing to narrate for them.

    Args:
        pptx_path: Path to the input .pptx file.
        prompt_wav_path: Prompt recording passed to get_tts() for each slide.

    Returns:
        Path of the generated .mp4 file.

    Raises:
        RuntimeError: If the LibreOffice conversion or an ffmpeg step fails.
        ValueError: If no slide has any speaker notes.
    """
    # dirname() is '' for a bare file name; fall back to the current
    # directory so '--outdir' never receives an empty argument.
    out_dir = os.path.dirname(pptx_path) or '.'
    cmd = ['libreoffice', '--headless', '--convert-to', 'pdf', pptx_path,
           '--outdir', out_dir]
    result = subprocess.run(cmd, capture_output=True, text=True)

    base_path = os.path.splitext(pptx_path)[0]
    pdf_path = base_path + '.pdf'
    output_path = base_path + '.mp4'

    if result.returncode != 0 or not os.path.isfile(pdf_path):
        raise RuntimeError(
            'libreoffice failed to convert {!r} to PDF (exit code {}): {}'
            .format(pptx_path, result.returncode, result.stderr.strip()))

    with tempfile.TemporaryDirectory() as temp_path:
        images_from_path = convert_from_path(pdf_path)
        prs = Presentation(pptx_path)
        assert len(images_from_path) == len(prs.slides), \
            'PDF page count does not match slide count'

        # Only clips that were actually produced may be concatenated.
        segments = []
        for i, (slide, image) in enumerate(zip(prs.slides, images_from_path)):
            notes = _slide_notes(slide)
            if not notes.strip():
                continue

            image_path = os.path.join(temp_path, 'frame_{}.jpg'.format(i))
            audio_path = os.path.join(temp_path, 'frame_{}.mp3'.format(i))

            image.save(image_path)
            get_tts(prompt_wav_path, notes, audio_path)

            if ffmpeg_call(image_path, audio_path, temp_path, i) != 0:
                raise RuntimeError(
                    '{} failed while encoding slide {}'.format(FFMPEG_NAME, i))
            segments.append(os.path.join(temp_path, 'frame_{}.ts'.format(i)))

        if not segments:
            raise ValueError(
                'no slide in {!r} has speaker notes to narrate'
                .format(pptx_path))

        video_list_str = 'concat:' + '|'.join(segments)
        if ffmpeg_concat(video_list_str, output_path) != 0:
            raise RuntimeError(
                '{} failed to write {!r}'.format(FFMPEG_NAME, output_path))

    return output_path


def ffmpeg_call(image_path, audio_path, temp_path, i):
    """Encode one still image plus its audio into ``frame_<i>.ts``.

    Two ffmpeg runs: first an intermediate ``frame_<i>.mp4`` is encoded from
    the looped image and the audio, then it is remuxed to MPEG-TS. Both files
    are written into *temp_path*.

    Returns:
        0 on success, otherwise the exit code of the ffmpeg run that failed.
    """
    out_path_mp4 = os.path.join(temp_path, 'frame_{}.mp4'.format(i))
    out_path_ts = os.path.join(temp_path, 'frame_{}.ts'.format(i))

    status = call([FFMPEG_NAME, '-loop', '1', '-y', '-i', image_path,
                   '-i', audio_path, '-vf', 'scale=2666:1500',
                   '-c:v', 'libx264', '-tune', 'stillimage',
                   '-c:a', 'aac', '-b:a', '192k', '-pix_fmt', 'yuv420p',
                   '-shortest', out_path_mp4])
    if status != 0:
        # Without the intermediate mp4 the remux step cannot succeed.
        return status

    return call([FFMPEG_NAME, '-y', '-i', out_path_mp4, '-c', 'copy',
                 '-bsf:v', 'h264_mp4toannexb', '-f', 'mpegts', out_path_ts])


def ffmpeg_concat(video_list_str, out_path):
    """Concatenate MPEG-TS clips into *out_path* without re-encoding.

    Args:
        video_list_str: Input given to ffmpeg's ``-i`` option, e.g.
            ``'concat:a.ts|b.ts'``.
        out_path: Path of the output file.

    Returns:
        The ffmpeg exit code (0 on success).
    """
    return call([FFMPEG_NAME, '-y', '-f', 'mpegts',
                 '-i', '{}'.format(video_list_str),
                 '-c', 'copy', '-bsf:a', 'aac_adtstoasc', out_path])


def main():
    """Parse the command line and run the presenter on the given .pptx."""
    parser = argparse.ArgumentParser(description='PPT Presenter help.')
    parser.add_argument(
        '--pptx',
        default='../../ppagent_2025-06-29_152592d9-df14-48d0-b6de-99fa7fe4fdac.pptx',
        help='input pptx path')
    args = parser.parse_args()
    ppt_presenter(args.pptx)


if __name__ == '__main__':
    main()