Spaces:

AIGeeksGroup
/

PresentAgent

Runtime error

App Files Files Community

PresentAgent / MegaTTS3 /test.py

sjw712

Upload 208 files

d961e88 verified 7 months ago

raw

history blame contribute delete

4.61 kB

	#!/usr/bin/env python
	# -*- coding: utf-8 -*-


	import os
	import tempfile
	import argparse
	from subprocess import call
	import subprocess
	from pdf2image import convert_from_path
	from pptx import Presentation
	# from gtts import gTTS


	__author__ = ['chaonan99']


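	# On some systems the ffmpeg binary is installed under the name 'avconv';
	# switch FFMPEG_NAME to the commented-out value below if that is the case.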
	FFMPEG_NAME = 'ffmpeg'
	# FFMPEG_NAME = 'avconv'

	import os
	from typing import Optional

	from tts.infer_cli import MegaTTS3DiTInfer # adjust import path as needed
	from tts.utils.audio_utils.io import save_wav


	def get_tts(
	    input_wav_path: str,
	    input_text: str,
	    output_path: str,
	    time_step: int = 32,
	    p_w: float = 1.6,
	    t_w: float = 2.5,
	    device: Optional[str] = None,
	) -> str:
	    """Generate TTS audio for ``input_text`` using a prompt WAV file.

	    Args:
	        input_wav_path: Path to the input WAV (prompt) file. If a file with
	            the same stem and a ``.npy`` extension exists next to it, that
	            path is passed to the model as ``latent_file``.
	        input_text: Text to synthesize.
	        output_path: Path handed to ``save_wav`` for the generated audio.
	            Its parent directory is created if it does not exist yet.
	        time_step: Diffusion inference steps.
	        p_w: Intelligibility weight.
	        t_w: Similarity weight.
	        device: Device specifier (e.g. 'cuda' or 'cpu'), forwarded unchanged
	            to ``MegaTTS3DiTInfer``.

	    Returns:
	        ``output_path``, exactly as it was passed in.
	    """
	    # NOTE(review): a new MegaTTS3DiTInfer is constructed on every call, so a
	    # caller that synthesizes one clip per slide pays that cost each time.
	    # Consider reusing a single instance if construction is expensive.
	    infer = MegaTTS3DiTInfer(device=device)

	    # Read the prompt audio as raw bytes.
	    with open(input_wav_path, 'rb') as f:
	        audio_bytes = f.read()

	    # Use the sibling ``<stem>.npy`` latent file only when it actually exists.
	    potential_npy = os.path.splitext(input_wav_path)[0] + '.npy'
	    latent_file = potential_npy if os.path.isfile(potential_npy) else None

	    # Preprocess the prompt, then synthesize the requested text.
	    resource_context = infer.preprocess(audio_bytes, latent_file=latent_file)
	    wav_bytes = infer.forward(
	        resource_context,
	        input_text,
	        time_step=time_step,
	        p_w=p_w,
	        t_w=t_w,
	    )

	    # Make sure the destination directory exists before saving; a bare file
	    # name has an empty dirname and needs no directory.
	    out_dir = os.path.dirname(output_path)
	    if out_dir:
	        os.makedirs(out_dir, exist_ok=True)
	    save_wav(wav_bytes, output_path)

	    return output_path





	def ppt_presenter(pptx_path, prompt_wav="assets/English_prompt.wav"):
	    """Turn a .pptx deck into a narrated .mp4 written next to the deck.

	    The deck is converted to PDF with headless LibreOffice, each PDF page is
	    rasterized, and for every slide that has non-blank speaker notes the notes
	    are synthesized with ``get_tts`` and combined with the slide image by
	    ``ffmpeg_call``. The resulting per-slide segments are joined by
	    ``ffmpeg_concat`` into ``<pptx stem>.mp4``. Slides without notes are
	    skipped and do not appear in the video.

	    Args:
	        pptx_path: Path to the input .pptx file.
	        prompt_wav: Prompt WAV passed to ``get_tts`` as the voice reference.

	    Raises:
	        RuntimeError: if LibreOffice fails or produces no PDF.
	        ValueError: if the PDF page count differs from the slide count, or if
	            no slide has speaker notes to narrate.
	    """
	    # dirname() of a bare file name is '', which is not a usable --outdir;
	    # resolve against the current directory instead.
	    out_dir = os.path.dirname(os.path.abspath(pptx_path))
	    cmd = ['libreoffice', '--headless', '--convert-to', 'pdf', pptx_path, '--outdir', out_dir]
	    result = subprocess.run(cmd, capture_output=True, text=True)

	    stem = os.path.splitext(pptx_path)[0]
	    pdf_path = stem + '.pdf'
	    output_path = stem + '.mp4'
	    if result.returncode != 0 or not os.path.isfile(pdf_path):
	        raise RuntimeError(
	            'libreoffice could not convert {!r} to PDF (exit code {}): {}'.format(
	                pptx_path, result.returncode, result.stderr.strip()))

	    with tempfile.TemporaryDirectory() as temp_path:
	        images_from_path = convert_from_path(pdf_path)
	        prs = Presentation(pptx_path)
	        # Slides and page images are paired positionally below, so a count
	        # mismatch would narrate the wrong images.
	        if len(images_from_path) != len(prs.slides):
	            raise ValueError(
	                'PDF has {} page(s) but the presentation has {} slide(s)'.format(
	                    len(images_from_path), len(prs.slides)))

	        # Only segments that were actually rendered may be concatenated.
	        video_list = []
	        for i, (slide, image) in enumerate(zip(prs.slides, images_from_path)):
	            if not slide.has_notes_slide:
	                continue
	            # notes_text_frame can be None when the notes slide has no
	            # notes placeholder.
	            notes_frame = slide.notes_slide.notes_text_frame
	            notes = notes_frame.text if notes_frame is not None else ''
	            if not notes.strip():
	                continue

	            image_path = os.path.join(temp_path, 'frame_{}.jpg'.format(i))
	            # NOTE(review): whether save_wav really writes MP3 data at an
	            # '.mp3' path is not visible here - confirm in tts.utils.
	            audio_path = os.path.join(temp_path, 'frame_{}.mp3'.format(i))

	            image.save(image_path)
	            get_tts(prompt_wav, notes, audio_path)

	            ffmpeg_call(image_path, audio_path, temp_path, i)
	            video_list.append(os.path.join(temp_path, 'frame_{}.ts'.format(i)))

	        if not video_list:
	            raise ValueError(
	                'no slide in {!r} has speaker notes to narrate'.format(pptx_path))

	        # ffmpeg is run without a shell, so the concat separator is a plain
	        # '|'; a backslash would become part of the file names.
	        video_list_str = 'concat:' + '|'.join(video_list)
	        # Must run while the temporary directory still exists.
	        ffmpeg_concat(video_list_str, output_path)


	def ffmpeg_call(image_path, audio_path, temp_path, i):
	    """Render one slide image plus its audio into ``frame_<i>.ts``.

	    Runs ffmpeg twice: first to encode the looped still image together with
	    the audio into ``frame_<i>.mp4``, then to remux that file without
	    re-encoding into an MPEG-TS file ``frame_<i>.ts``. Both outputs are
	    written inside ``temp_path``.

	    Args:
	        image_path: Path to the slide image.
	        audio_path: Path to the narration audio for the slide.
	        temp_path: Directory that receives the intermediate and final files.
	        i: Slide index used in the output file names.

	    Raises:
	        subprocess.CalledProcessError: if either ffmpeg run exits non-zero.
	    """
	    out_path_mp4 = os.path.join(temp_path, 'frame_{}.mp4'.format(i))
	    out_path_ts = os.path.join(temp_path, 'frame_{}.ts'.format(i))

	    # check=True: fail here rather than let a missing segment break the
	    # final concatenation with a less obvious error.
	    subprocess.run(
	        [FFMPEG_NAME, '-loop', '1', '-y', '-i', image_path, '-i', audio_path,
	         '-vf', 'scale=2666:1500', '-c:v', 'libx264', '-tune', 'stillimage', '-c:a', 'aac',
	         '-b:a', '192k', '-pix_fmt', 'yuv420p', '-shortest', out_path_mp4],
	        check=True)

	    # Stream-copy into MPEG-TS so the segments can be joined with 'concat:'.
	    subprocess.run(
	        [FFMPEG_NAME, '-y', '-i', out_path_mp4, '-c', 'copy',
	         '-bsf:v', 'h264_mp4toannexb', '-f', 'mpegts', out_path_ts],
	        check=True)


	def ffmpeg_concat(video_list_str, out_path):
	    """Join MPEG-TS segments into a single output file with ffmpeg.

	    Args:
	        video_list_str: ffmpeg input specifier naming the segments, e.g.
	            ``'concat:a.ts|b.ts'``.
	        out_path: Path of the file to write; an existing file is
	            overwritten (``-y``).

	    Raises:
	        subprocess.CalledProcessError: if ffmpeg exits non-zero.
	    """
	    # Streams are copied, not re-encoded. check=True surfaces a failed final
	    # mux instead of returning silently with no output.
	    subprocess.run(
	        [FFMPEG_NAME, '-y', '-f', 'mpegts', '-i', str(video_list_str),
	         '-c', 'copy', '-bsf:a', 'aac_adtstoasc', out_path],
	        check=True)


	def main():
	    """Read the ``--pptx`` command-line option and convert that deck to video."""
	    cli = argparse.ArgumentParser(description='PPT Presenter help.')
	    cli.add_argument(
	        '--pptx',
	        default='../../ppagent_2025-06-29_152592d9-df14-48d0-b6de-99fa7fe4fdac.pptx',
	        help='input pptx path',
	    )
	    options = cli.parse_args()
	    ppt_presenter(options.pptx)


	# Run the converter only when executed as a script, not when imported.
	if __name__ == '__main__':
	    main()