sjw712's picture
Upload 208 files
d961e88 verified
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import tempfile
import argparse
from subprocess import call
import subprocess
from pdf2image import convert_from_path
from pptx import Presentation
# from gtts import gTTS
__author__ = ['chaonan99']
# Name of the ffmpeg-compatible executable used for every encoding step.
# On some systems the tool is installed as 'avconv' instead; set the value
# below to 'avconv' in that case.
FFMPEG_NAME = 'ffmpeg'
import os
from typing import Optional
from tts.infer_cli import MegaTTS3DiTInfer # adjust import path as needed
from tts.utils.audio_utils.io import save_wav
# Inference objects keyed by the requested device, so that repeated calls
# (one per slide in ppt_presenter) do not rebuild the model every time.
_TTS_INFER_CACHE = {}


def _get_tts_infer(device: Optional[str] = None):
    """Return the cached ``MegaTTS3DiTInfer`` for *device*, creating it once."""
    # NOTE(review): assumes one MegaTTS3DiTInfer instance can serve several
    # preprocess()/forward() calls in a row -- confirm against the tts package.
    if device not in _TTS_INFER_CACHE:
        _TTS_INFER_CACHE[device] = MegaTTS3DiTInfer(device=device)
    return _TTS_INFER_CACHE[device]


def get_tts(
    input_wav_path: str,
    input_text: str,
    output_path: str,
    time_step: int = 32,
    p_w: float = 1.6,
    t_w: float = 2.5,
    device: Optional[str] = None,
) -> str:
    """
    Generate TTS audio from an input WAV file and text prompt.

    Args:
        input_wav_path: Path to the input WAV (prompt) file. If a file with
            the same stem and a ``.npy`` extension sits next to it, that file
            is passed to the model as the latent file.
        input_text: Text to synthesize.
        output_path: Path to the output audio file. Its parent directory is
            created if it does not exist yet.
        time_step: Diffusion inference steps.
        p_w: Intelligibility weight.
        t_w: Similarity weight.
        device: Device specifier (e.g., 'cuda' or 'cpu'). Passed through to
            ``MegaTTS3DiTInfer``; ``None`` leaves the choice to that class.

    Returns:
        ``output_path``, exactly as it was passed in.
    """
    # Reuse a previously constructed inference object for this device.
    infer = _get_tts_infer(device)

    # Read prompt audio
    with open(input_wav_path, 'rb') as f:
        audio_bytes = f.read()

    # Locate corresponding latent file if available
    potential_npy = os.path.splitext(input_wav_path)[0] + '.npy'
    latent_file = potential_npy if os.path.isfile(potential_npy) else None

    # Preprocess the prompt, then synthesize speech for the text
    resource_context = infer.preprocess(audio_bytes, latent_file=latent_file)
    wav_bytes = infer.forward(
        resource_context,
        input_text,
        time_step=time_step,
        p_w=p_w,
        t_w=t_w,
    )

    # Ensure output directory exists and save. dirname() is '' for a bare
    # filename, in which case the current directory is used and nothing
    # needs to be created.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    save_wav(wav_bytes, output_path)
    return output_path
def ppt_presenter(pptx_path, prompt_wav="assets/English_prompt.wav"):
    """
    Turn a PowerPoint deck into a narrated MP4 written next to the deck.

    The deck is converted to PDF with LibreOffice, each PDF page is rendered
    to an image, and for every slide that carries speaker notes the notes are
    synthesized with ``get_tts`` and combined with the slide image into a
    video segment. The segments are then concatenated into
    ``<deck stem>.mp4``. Slides without speaker notes produce no segment and
    are therefore absent from the final video.

    Args:
        pptx_path: Path to the input ``.pptx`` file.
        prompt_wav: Prompt WAV handed to ``get_tts`` as the reference voice.

    Raises:
        RuntimeError: If the LibreOffice conversion fails or yields no PDF.
        ValueError: If the PDF page count differs from the slide count, or
            if no slide has speaker notes.
    """
    # dirname() is '' for a bare filename; give LibreOffice a real directory.
    out_dir = os.path.dirname(pptx_path) or '.'
    cmd = ['libreoffice', '--headless', '--convert-to', 'pdf', pptx_path, '--outdir', out_dir]
    result = subprocess.run(cmd, capture_output=True, text=True)

    stem = os.path.splitext(pptx_path)[0]
    pdf_path = stem + '.pdf'
    output_path = stem + '.mp4'

    # Fail here with the converter's own message instead of letting the PDF
    # renderer stumble over a missing file later.
    if result.returncode != 0 or not os.path.isfile(pdf_path):
        raise RuntimeError(
            'LibreOffice could not convert {!r} to PDF (exit code {}): {}'.format(
                pptx_path, result.returncode, result.stderr.strip()))

    with tempfile.TemporaryDirectory() as temp_path:
        images_from_path = convert_from_path(pdf_path)
        prs = Presentation(pptx_path)
        if len(images_from_path) != len(prs.slides):
            raise ValueError(
                'PDF has {} pages but the presentation has {} slides'.format(
                    len(images_from_path), len(prs.slides)))

        # Only segments that were actually requested are concatenated;
        # listing every slide index would point ffmpeg at files that were
        # never written for slides without notes.
        segment_paths = []
        for i, (slide, image) in enumerate(zip(prs.slides, images_from_path)):
            if not slide.has_notes_slide:
                continue
            notes_frame = slide.notes_slide.notes_text_frame
            if notes_frame is None:
                # Notes slide without a notes placeholder: nothing to read.
                continue
            notes = notes_frame.text

            image_path = os.path.join(temp_path, 'frame_{}.jpg'.format(i))
            audio_path = os.path.join(temp_path, 'frame_{}.mp3'.format(i))
            image.save(image_path)
            get_tts(prompt_wav, notes, audio_path)
            ffmpeg_call(image_path, audio_path, temp_path, i)
            # ffmpeg_call writes its transport-stream segment under this name.
            segment_paths.append(os.path.join(temp_path, 'frame_{}.ts'.format(i)))

        if not segment_paths:
            raise ValueError(
                'no slide in {!r} has speaker notes; nothing to narrate'.format(pptx_path))

        video_list_str = 'concat:' + '|'.join(segment_paths)
        ffmpeg_concat(video_list_str, output_path)
def ffmpeg_call(image_path, audio_path, temp_path, i):
    """
    Build the video segment for one slide from a still image and its audio.

    Two files are written into ``temp_path``: ``frame_<i>.mp4`` (the image
    looped under the audio, H.264 video + AAC audio, scaled to 2666x1500)
    and ``frame_<i>.ts`` (the same streams copied into an MPEG-TS container
    so the segments can later be joined with ffmpeg's ``concat:`` input).

    Args:
        image_path: Path to the slide image.
        audio_path: Path to the narration audio for the slide.
        temp_path: Directory that receives the intermediate files.
        i: Slide index used in the output file names.

    Raises:
        subprocess.CalledProcessError: If either ffmpeg invocation exits with
            a non-zero status.
    """
    out_path_mp4 = os.path.join(temp_path, 'frame_{}.mp4'.format(i))
    out_path_ts = os.path.join(temp_path, 'frame_{}.ts'.format(i))

    # '-loop 1' repeats the still image; '-shortest' stops the output when
    # the shorter input (the audio) ends.
    encode_cmd = [
        FFMPEG_NAME, '-loop', '1', '-y', '-i', image_path, '-i', audio_path,
        '-vf', 'scale=2666:1500', '-c:v', 'libx264', '-tune', 'stillimage',
        '-c:a', 'aac', '-b:a', '192k', '-pix_fmt', 'yuv420p', '-shortest',
        out_path_mp4,
    ]
    # Stream-copy into MPEG-TS; the bitstream filter rewrites the H.264
    # stream into the Annex B form used by transport streams.
    remux_cmd = [
        FFMPEG_NAME, '-y', '-i', out_path_mp4, '-c', 'copy',
        '-bsf:v', 'h264_mp4toannexb', '-f', 'mpegts', out_path_ts,
    ]

    # check=True: stop at the first failing step instead of silently leaving
    # a missing or truncated segment behind for the concat stage.
    for cmd in (encode_cmd, remux_cmd):
        subprocess.run(cmd, check=True)
def ffmpeg_concat(video_list_str, out_path):
    """
    Join MPEG-TS segments into a single output file without re-encoding.

    Args:
        video_list_str: ffmpeg input specification for the segments, e.g.
            ``'concat:a.ts|b.ts'``.
        out_path: Path of the video file to write (overwritten if present).

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    cmd = [
        FFMPEG_NAME, '-y', '-f', 'mpegts', '-i', '{}'.format(video_list_str),
        '-c', 'copy', '-bsf:a', 'aac_adtstoasc', out_path,
    ]
    # check=True so a failed join is reported instead of ending the run
    # quietly without an output video.
    subprocess.run(cmd, check=True)
def main():
    """Command-line entry point: read ``--pptx`` and narrate that deck."""
    default_pptx = '../../ppagent_2025-06-29_152592d9-df14-48d0-b6de-99fa7fe4fdac.pptx'
    cli = argparse.ArgumentParser(description='PPT Presenter help.')
    cli.add_argument('--pptx', default=default_pptx, help='input pptx path')
    options = cli.parse_args()
    ppt_presenter(options.pptx)
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()