Spaces:

zanesmit29
/

pdf_scientific_article_to_speech

Runtime error

pdf_scientific_article_to_speech / app.py

zanesmit29

Fixed typo in notebook and removed unnecessary lines in app.py

62752c5 5 months ago

2.51 kB



	import gradio as gr
	import torch
	import soundfile as sf
	from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
	import pdfplumber
	import tqdm
	import soundfile as sf


	# Build the summarization pipeline a single time at import so that every
	# request handled by this module reuses the same loaded t5-large model.
	summarizer = pipeline(task="summarization", model="t5-large")

	def _slice_abstract(text, start_marker="Abstract",
	                    end_markers=("Introduction", "Contents")):
	    """Return the stripped span of *text* between the start and end markers.

	    The span begins right after the first occurrence of *start_marker* (or at
	    the start of *text* when the marker is absent) and ends at the first
	    marker from *end_markers*, tried in order, that occurs after that point.
	    When no end marker follows, the span runs to the end of *text*.
	    """
	    marker_pos = text.find(start_marker)
	    # str.find returns -1 on a miss; never let that leak into slice arithmetic.
	    body_start = 0 if marker_pos == -1 else marker_pos + len(start_marker)
	    for marker in end_markers:
	        # Search only after the abstract begins so an earlier mention of the
	        # end heading cannot produce an empty or inverted slice.
	        body_end = text.find(marker, body_start)
	        if body_end != -1:
	            return text[body_start:body_end].strip()
	    return text[body_start:].strip()


	def extract_abstract(path):
	    """Return the abstract of the PDF at *path* as plain text.

	    The text of every page is extracted with pdfplumber, the pages are joined
	    with newlines, and the section between the "Abstract" heading and the
	    following "Introduction" (or, failing that, "Contents") heading is
	    returned.

	    Args:
	        path: Filesystem path of the PDF to read.

	    Returns:
	        The stripped abstract text. Falls back to the start of the document
	        when there is no "Abstract" heading and to the end of the document
	        when no end heading follows it. Empty when no text was extracted.
	    """
	    page_texts = []
	    with pdfplumber.open(path) as pdf:
	        for page in tqdm.tqdm(pdf.pages):
	            page_text = page.extract_text(x_tolerance=2)
	            # Skip pages that yield no text (empty or falsy result).
	            if page_text:
	                page_texts.append(page_text)
	    # Join with a newline so the last word of one page is not fused with the
	    # first word of the next.
	    return _slice_abstract("\n".join(page_texts))

	def summarize_text(text):
	    """Summarize *text* into one sentence using the module-level pipeline.

	    Args:
	        text: The passage to summarize.

	    Returns:
	        The generated summary with detached sentence breaks (" . ") turned
	        into commas and a single period appended at the end.
	    """
	    # Prefix nudging the model toward a single-sentence summary.
	    prompt = "Summarize the following text in one sentence: " + text
	    result = summarizer(
	        prompt,
	        max_length=68,
	        min_length=40,
	        do_sample=False,
	        num_beams=1,
	        early_stopping=True,
	    )
	    summary = result[0]['summary_text'].strip()
	    # Remove the terminal period(s) *before* converting inner sentence breaks;
	    # otherwise a trailing " ." is turned into "," and the summary ends in ",.".
	    summary = summary.rstrip(" .")
	    # Only a period surrounded by spaces is treated as a sentence break, so
	    # text such as " .5" is left untouched.
	    summary = summary.replace(" . ", ", ")
	    return summary + "."

	# Lazily populated (processor, model, vocoder) triple, so the SpeechT5
	# checkpoints are loaded once per process instead of once per request.
	_TTS_COMPONENTS = None


	def _load_tts_components():
	    """Return the SpeechT5 (processor, model, vocoder), loading on first use."""
	    global _TTS_COMPONENTS
	    if _TTS_COMPONENTS is None:
	        _TTS_COMPONENTS = (
	            SpeechT5Processor.from_pretrained("microsoft/speecht5_tts"),
	            SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts"),
	            SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan"),
	        )
	    return _TTS_COMPONENTS


	def pdf_to_speech(file):
	    """Turn an uploaded PDF into a spoken one-sentence summary of its abstract.

	    Args:
	        file: The upload handed over by the Gradio file input: either an
	            object exposing the local path as ``.name`` or the path itself.

	    Returns:
	        Path of a newly written WAV file containing the synthesized speech.

	    Raises:
	        gr.Error: If no file was uploaded or no abstract text was found.
	    """
	    import tempfile

	    if file is None:
	        raise gr.Error("Please upload a PDF file.")
	    # Accept both a file-like wrapper with `.name` and a plain path string.
	    pdf_path = getattr(file, "name", file)

	    abstract = extract_abstract(pdf_path)
	    if not abstract:
	        raise gr.Error("Could not find any abstract text in the PDF.")
	    summary = summarize_text(abstract)

	    processor, model, vocoder = _load_tts_components()
	    inputs = processor(text=summary, return_tensors="pt")
	    # All-zero vector used as a neutral speaker embedding.
	    # NOTE(review): a real x-vector may give clearer speech; confirm.
	    speaker_embeddings = torch.zeros(1, 512)

	    speech = model.generate_speech(
	        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
	    )
	    speech_np = speech.squeeze().cpu().numpy()

	    # Write to a unique temporary file so concurrent requests cannot
	    # overwrite each other's audio; delete=False keeps it for Gradio to serve.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	        wav_output_path = tmp.name
	    sf.write(wav_output_path, speech_np, 16000)

	    # Gradio's filepath-typed audio output plays/downloads from this path.
	    return wav_output_path

	# Gradio UI: a single PDF upload goes in, a single audio file path comes out.
	_pdf_input = gr.File(file_types=[".pdf"])
	_audio_output = gr.Audio(type="filepath", label="Generated Audio")

	demo = gr.Interface(
	    fn=pdf_to_speech,
	    inputs=_pdf_input,
	    outputs=_audio_output,
	    title="PDF to Speech summary",
	    description="Upload a PDF and get a spoken summary audio output.",
	)

	# Start the web server with the same launch flags as before.
	demo.launch(debug=True, prevent_thread_lock=True, share=True)