Spaces:

AIGeeksGroup
/

PresentAgent

Runtime error

File size: 4,609 Bytes

d961e88

#!/usr/bin/env python
# -*- coding: utf-8 -*-


import os
import tempfile
import argparse
from subprocess import call
import subprocess
from pdf2image import convert_from_path
from pptx import Presentation
# from gtts import gTTS


# Module author(s).
__author__ = ['chaonan99']


# Name of the ffmpeg-compatible executable invoked by the ffmpeg helpers in
# this file. Some systems ship the tool as `avconv`; switch to the
# commented-out alternative below in that case.
FFMPEG_NAME = 'ffmpeg'
# FFMPEG_NAME = 'avconv'

import os
from typing import Optional

from tts.infer_cli import MegaTTS3DiTInfer  # adjust import path as needed
from tts.utils.audio_utils.io import save_wav


# Inference models keyed by the requested device, so that repeated calls
# (e.g. one per slide) reuse the already-constructed model instead of
# building a new one every time.
_INFER_CACHE: dict = {}


def _get_infer(device: Optional[str]):
    """Return a cached MegaTTS3DiTInfer for *device*, creating it on first use."""
    if device not in _INFER_CACHE:
        _INFER_CACHE[device] = MegaTTS3DiTInfer(device=device)
    return _INFER_CACHE[device]


def get_tts(
    input_wav_path: str,
    input_text: str,
    output_path: str,
    time_step: int = 32,
    p_w: float = 1.6,
    t_w: float = 2.5,
    device: Optional[str] = None,
) -> str:
    """Generate TTS audio from an input WAV file and text prompt.

    Args:
        input_wav_path: Path to the input WAV (prompt) file. If a file with
            the same stem and a ``.npy`` extension sits next to it, it is
            passed to the model as the latent file.
        input_text: Text to synthesize.
        output_path: Path to the output audio file. Its parent directory is
            created if it does not exist yet.
        time_step: Diffusion inference steps.
        p_w: Intelligibility weight.
        t_w: Similarity weight.
        device: Device specifier (e.g., 'cuda' or 'cpu'). If None, the value
            is forwarded as-is and the model picks its own default.

    Returns:
        ``output_path``, unchanged.
    """
    # Reuse the inference model across calls; constructing it is assumed to
    # be the expensive part (weights loading) -- TODO confirm.
    infer = _get_infer(device)

    # Read prompt audio
    with open(input_wav_path, 'rb') as f:
        audio_bytes = f.read()

    # Locate corresponding latent file if available
    potential_npy = os.path.splitext(input_wav_path)[0] + '.npy'
    latent_file = potential_npy if os.path.isfile(potential_npy) else None

    # Preprocess: extract features and durations
    resource_context = infer.preprocess(audio_bytes, latent_file=latent_file)

    # Synthesize speech
    wav_bytes = infer.forward(
        resource_context,
        input_text,
        time_step=time_step,
        p_w=p_w,
        t_w=t_w,
    )

    # Ensure output directory exists and save. dirname() is '' for a bare
    # filename, in which case the current directory is used and nothing
    # needs to be created.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    save_wav(wav_bytes, output_path)

    return output_path





def ppt_presenter(pptx_path):
    """Render a narrated MP4 video from a PowerPoint deck.

    The deck is converted to PDF with headless LibreOffice, each PDF page is
    rasterised to an image, and the speaker notes of each slide are
    synthesised to audio with ``get_tts``. Every slide that has notes becomes
    one MPEG-TS segment (still image + narration); the segments are then
    concatenated into ``<pptx stem>.mp4`` next to the input file.

    Slides without speaker notes (or with blank notes) are left out of the
    video.

    Args:
        pptx_path: Path to the input ``.pptx`` file.

    Raises:
        RuntimeError: If the LibreOffice conversion fails or yields no PDF.
        ValueError: If the PDF page count does not match the slide count, or
            if no slide has any speaker notes to narrate.
    """
    # dirname() is '' for a bare filename; resolve to an absolute directory so
    # LibreOffice is never handed an empty --outdir argument.
    out_dir = os.path.dirname(os.path.abspath(pptx_path))
    cmd = ['libreoffice', '--headless', '--convert-to', 'pdf', pptx_path, '--outdir', out_dir]
    result = subprocess.run(cmd, capture_output=True, text=True)

    pdf_path = os.path.splitext(pptx_path)[0] + '.pdf'
    output_path = os.path.splitext(pptx_path)[0] + '.mp4'

    # Fail early with LibreOffice's own diagnostics instead of letting the PDF
    # rasteriser trip over a missing (or stale) file later on.
    if result.returncode != 0 or not os.path.isfile(pdf_path):
        raise RuntimeError(
            'LibreOffice failed to convert {!r} to PDF (exit code {}): {}'.format(
                pptx_path, result.returncode, result.stderr.strip()))

    with tempfile.TemporaryDirectory() as temp_path:
        images_from_path = convert_from_path(pdf_path)
        prs = Presentation(pptx_path)
        # Explicit check rather than `assert`, which is stripped under -O.
        if len(images_from_path) != len(prs.slides):
            raise ValueError(
                'PDF has {} pages but the presentation has {} slides'.format(
                    len(images_from_path), len(prs.slides)))

        # Only segments that were actually produced may be concatenated;
        # slides that are skipped below never get a frame_<i>.ts file.
        video_list = []
        for i, (slide, image) in enumerate(zip(prs.slides, images_from_path)):
            if not slide.has_notes_slide:
                continue
            notes_frame = slide.notes_slide.notes_text_frame
            notes = notes_frame.text if notes_frame is not None else ''
            if not notes.strip():
                # Nothing to narrate for this slide.
                continue

            image_path = os.path.join(temp_path, 'frame_{}.jpg'.format(i))
            audio_path = os.path.join(temp_path, 'frame_{}.mp3'.format(i))

            image.save(image_path)
            get_tts("assets/English_prompt.wav", notes, audio_path)

            ffmpeg_call(image_path, audio_path, temp_path, i)
            video_list.append(os.path.join(temp_path, 'frame_{}.ts'.format(i)))

        if not video_list:
            raise ValueError(
                'No slide in {!r} has speaker notes to narrate'.format(pptx_path))

        video_list_str = 'concat:' + '|'.join(video_list)
        ffmpeg_concat(video_list_str, output_path)


def ffmpeg_call(image_path, audio_path, temp_path, i):
    """Build the video segment for slide *i* from a still image and its audio.

    Two ffmpeg invocations are made: the first loops the image for the length
    of the audio track and writes ``frame_<i>.mp4`` into *temp_path*; the
    second remuxes that file, without re-encoding, into ``frame_<i>.ts``.
    Exit codes of both invocations are ignored.
    """
    mp4_segment = os.path.join(temp_path, f'frame_{i}.mp4')
    ts_segment = os.path.join(temp_path, f'frame_{i}.ts')

    # Step 1: still image + narration -> H.264/AAC MP4.
    encode_cmd = [FFMPEG_NAME, '-loop', '1', '-y']
    encode_cmd += ['-i', image_path, '-i', audio_path]
    encode_cmd += ['-vf', 'scale=2666:1500']
    encode_cmd += ['-c:v', 'libx264', '-tune', 'stillimage']
    encode_cmd += ['-c:a', 'aac', '-b:a', '192k']
    encode_cmd += ['-pix_fmt', 'yuv420p', '-shortest', mp4_segment]
    call(encode_cmd)

    # Step 2: stream-copy the MP4 into an MPEG-TS container.
    remux_cmd = [FFMPEG_NAME, '-y', '-i', mp4_segment, '-c', 'copy']
    remux_cmd += ['-bsf:v', 'h264_mp4toannexb', '-f', 'mpegts', ts_segment]
    call(remux_cmd)


def ffmpeg_concat(video_list_str, out_path):
    """Join MPEG-TS segments into a single output file via ffmpeg.

    *video_list_str* is handed to ffmpeg as its input argument (the caller in
    this file builds a ``concat:a.ts|b.ts|...`` string). Streams are copied
    without re-encoding and the result is written to *out_path*. The ffmpeg
    exit code is ignored.
    """
    source_spec = str(video_list_str)
    concat_cmd = [FFMPEG_NAME, '-y', '-f', 'mpegts', '-i', source_spec]
    concat_cmd += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc', out_path]
    call(concat_cmd)


def main():
    """Command-line entry point: parse ``--pptx`` and render that deck."""
    default_pptx = '../../ppagent_2025-06-29_152592d9-df14-48d0-b6de-99fa7fe4fdac.pptx'

    cli = argparse.ArgumentParser(description='PPT Presenter help.')
    cli.add_argument('--pptx', default=default_pptx, help='input pptx path')

    options = cli.parse_args()
    ppt_presenter(options.pptx)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()