# zanesmit29
# Fixed typo in notebook and removed unnecessary lines in app.py
# 62752c5
import gradio as gr
import torch
import soundfile as sf
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import pdfplumber
import tqdm
import soundfile as sf
# Build the t5-large summarization pipeline a single time at import so that
# every call to summarize_text() reuses the same loaded model.
summarizer = pipeline(task="summarization", model="t5-large")
def extract_abstract(path):
    """Return the abstract section of the PDF located at *path*.

    The text of every page is extracted with pdfplumber and joined with
    newlines. The abstract is taken to be everything between the first
    "Abstract" heading and the next "Introduction" (preferred) or
    "Contents" marker that follows it.

    Fallbacks, so that a missing marker never silently corrupts the slice:
    - no "Abstract" heading: the slice starts at the beginning of the text;
    - no end marker after the start: the slice runs to the end of the text.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The abstract text with surrounding whitespace stripped. May be an
        empty string when the PDF yields no extractable text.
    """
    page_texts = []
    with pdfplumber.open(path) as pdf:
        for page in tqdm.tqdm(pdf.pages):
            page_text = page.extract_text(x_tolerance=2)
            # extract_text can yield nothing for a page; skip those.
            if page_text:
                page_texts.append(page_text)
    # Join with a newline so words at a page break are not glued together.
    text = "\n".join(page_texts)

    heading = "Abstract"
    heading_pos = text.find(heading)
    # find() returns -1 when the heading is absent; start at 0 in that case
    # instead of letting -1 + len(heading) pick an arbitrary offset.
    body_start = heading_pos + len(heading) if heading_pos != -1 else 0

    # Look for the end marker only *after* the abstract starts, so an earlier
    # occurrence (e.g. in a title) cannot produce an empty or inverted slice.
    for end_marker in ("Introduction", "Contents"):
        body_end = text.find(end_marker, body_start)
        if body_end != -1:
            break
    else:
        # Neither marker present: keep everything up to the end of the text
        # rather than using -1, which would drop the final character.
        body_end = len(text)

    return text[body_start:body_end].strip()
def summarize_text(text):
    """Summarize *text* into a single sentence using the global summarizer.

    The model output is post-processed so that it reads as one sentence:
    interior " ." sentence breaks are turned into commas and the result is
    terminated with exactly one full stop.

    Args:
        text: The passage to summarize.

    Returns:
        The summary string, always ending in a single ".".
    """
    # Add prompt to encourage single sentence summary
    prompt = "Summarize the following text in one sentence: " + text
    result = summarizer(
        prompt,
        max_length=68,
        min_length=40,
        do_sample=False,
        num_beams=1,
        early_stopping=True,
    )
    raw_summary = result[0]["summary_text"].strip()
    # Remove the terminal full stop(s) *before* converting " ." to ",".
    # Otherwise a trailing " ." becomes "," and the summary ends in ",.".
    body = raw_summary.rstrip(" .")
    # Remaining " ." occurrences are interior sentence breaks; merge them
    # into one sentence by replacing each with a comma.
    body = body.replace(" .", ",")
    return body + "."
# Cache for the (processor, model, vocoder) triple; filled on first use so the
# SpeechT5 weights are loaded once instead of on every request.
_tts_components = None


def _load_tts_components():
    """Return the SpeechT5 processor, TTS model and vocoder, loading them once."""
    global _tts_components
    if _tts_components is None:
        _tts_components = (
            SpeechT5Processor.from_pretrained("microsoft/speecht5_tts"),
            SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts"),
            SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan"),
        )
    return _tts_components


def pdf_to_speech(file):
    """Turn an uploaded PDF into a spoken one-sentence summary of its abstract.

    Pipeline: extract_abstract() -> summarize_text() -> SpeechT5 synthesis,
    with the waveform written to "output.wav".

    Args:
        file: The value delivered by the Gradio file input. Either an object
            exposing the local path as ``.name`` or a plain path string.

    Returns:
        Path of the written WAV file ("output.wav").

    Raises:
        gr.Error: If no file was uploaded, or no abstract text could be
            extracted from the PDF.
    """
    if file is None:
        raise gr.Error("Please upload a PDF file first.")
    # Gradio file input provides file.name as local path; fall back to the
    # value itself when a plain path string is passed.
    pdf_path = getattr(file, "name", file)

    abstract = extract_abstract(pdf_path)
    if not abstract:
        # Summarizing an empty abstract would only voice the prompt itself.
        raise gr.Error("Could not extract any abstract text from this PDF.")
    summary = summarize_text(abstract)

    processor, model, vocoder = _load_tts_components()
    inputs = processor(text=summary, return_tensors="pt")
    speaker_embeddings = torch.zeros(1, 512)  # Neutral speaker embedding
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )
    waveform = speech.squeeze().cpu().numpy()

    wav_output_path = "output.wav"
    sf.write(wav_output_path, waveform, 16000)
    # Return path of wav file for Gradio audio output and download
    return wav_output_path
# Expose pdf_to_speech through a Gradio UI: a PDF upload goes in, and the
# generated WAV file comes back as a playable audio component.
demo = gr.Interface(
    title="PDF to Speech summary",
    description="Upload a PDF and get a spoken summary audio output.",
    fn=pdf_to_speech,
    inputs=gr.File(file_types=[".pdf"]),
    outputs=gr.Audio(label="Generated Audio", type="filepath"),
)

# Launch settings kept together so they are easy to review and adjust.
launch_options = {
    "share": True,
    "debug": True,
    "prevent_thread_lock": True,
}
demo.launch(**launch_options)