| | import gradio as gr |
| | from gtts import gTTS |
| | import pdfplumber |
| | from sumy.parsers.plaintext import PlaintextParser |
| | from sumy.nlp.tokenizers import Tokenizer |
| | from sumy.summarizers.lsa import LsaSummarizer |
| | import nltk |
| | import os |
| |
|
| | |
# Fetch the NLTK tokenizer data at import time (presumably what the sumy
# Tokenizer used further down needs for sentence splitting -- TODO confirm).
#
# Each resource is attempted on its own so that a failure on the first one
# does not skip the second. nltk.download() normally signals a failed
# download by returning False rather than raising, so the return value is
# checked in addition to catching exceptions.
for _nltk_resource in ("punkt", "punkt_tab"):
    try:
        if not nltk.download(_nltk_resource):
            print(f"Error downloading NLTK data: could not download '{_nltk_resource}'")
    except Exception as e:
        # Best-effort: report the problem and let the app start anyway.
        print(f"Error downloading NLTK data: {str(e)}")
| |
|
def extract_text_from_pdf(pdf_file):
    """
    Extract text from a PDF file using pdfplumber.

    Pages for which ``extract_text()`` returns nothing are skipped; the
    remaining page texts are joined with a single space and the result is
    stripped of leading/trailing whitespace.

    Args:
        pdf_file: Uploaded PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        str: The extracted text. If no page produced any non-whitespace text,
        the message "No text could be extracted from the PDF." is returned.
        If opening or reading the PDF raised, a message starting with
        "Error extracting text:" is returned instead of raising.
    """
    try:
        with pdfplumber.open(pdf_file) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Strip *before* testing for emptiness so that a PDF whose pages
        # contain only whitespace is reported as having no text, rather than
        # being returned as an empty string.
        text = " ".join(chunk for chunk in page_texts if chunk).strip()
        return text or "No text could be extracted from the PDF."
    except Exception as e:
        # Callers rely on getting a string back, so failures are reported
        # in-band with a fixed prefix.
        return f"Error extracting text: {str(e)}"
| |
|
def summarize_text(text, sentences_count=12, sentences_per_paragraph=3):
    """
    Summarize text into short paragraphs using the sumy LSA summarizer.

    With the defaults (12 sentences, 3 per paragraph) the summary is
    approximately four paragraphs long. Paragraphs are separated by a blank
    line and sentences within a paragraph by a single space.

    Args:
        text (str): Text to summarize.
        sentences_count (int): Number of sentences requested from the
            summarizer.
        sentences_per_paragraph (int): How many consecutive summary sentences
            are grouped into one paragraph.

    Returns:
        str: The summary, or one of the following messages:
        "Text is too short to summarize." when the input has fewer than 50
        whitespace-separated words; "No summary generated." when the
        summarizer produced nothing; or a message starting with
        "Error summarizing text:" if anything raised (including a
        non-positive ``sentences_per_paragraph``).
    """
    try:
        if len(text.split()) < 50:
            return "Text is too short to summarize."

        parser = PlaintextParser.from_string(text, Tokenizer("english"))
        summarizer = LsaSummarizer()
        sentences = [
            str(sentence)
            for sentence in summarizer(parser.document, sentences_count)
        ]

        # Group consecutive sentences into paragraphs. Joining (rather than
        # appending "sentence + ' '" in a loop) avoids leaving a trailing
        # space in front of every paragraph break.
        paragraphs = [
            " ".join(sentences[start:start + sentences_per_paragraph])
            for start in range(0, len(sentences), sentences_per_paragraph)
        ]
        summary_text = "\n\n".join(paragraphs).strip()
        return summary_text or "No summary generated."
    except Exception as e:
        # Failures are reported in-band with a fixed prefix instead of raising.
        return f"Error summarizing text: {str(e)}"
| |
|
def pdf_to_speech(pdf_file, lang="en"):
    """
    Convert text from a PDF to summarized speech using gTTS.

    The PDF text is extracted, summarized, and the summary is synthesized to
    an MP3 file. Any failure along the way is reported through the second
    element of the returned tuple; this function does not raise.

    Args:
        pdf_file: Uploaded PDF file (``None`` when nothing was uploaded).
        lang (str): Language code (default is 'en' for English).

    Returns:
        tuple: (Path to audio file or None, summarized text or error message).
    """
    # The helper functions report failures in-band as strings. Match the
    # exact messages they emit (by prefix) instead of searching for words
    # such as "Error" or "too short" anywhere in the text -- a document that
    # merely *contains* those words must not be treated as a failure.
    extraction_failures = (
        "Error extracting text:",
        "No text could be extracted from the PDF.",
    )
    summary_failures = (
        "Error summarizing text:",
        "Text is too short to summarize.",
        "No summary generated.",
    )

    # Nothing uploaded: give a clear message rather than a library error.
    if pdf_file is None:
        return None, "Please upload a PDF file."

    try:
        text = extract_text_from_pdf(pdf_file)
        if text.startswith(extraction_failures):
            return None, text

        summarized_text = summarize_text(text, sentences_count=12)
        if summarized_text.startswith(summary_failures):
            return None, summarized_text

        tts = gTTS(text=summarized_text, lang=lang, slow=False)

        # NOTE(review): fixed filename in the working directory; requests
        # handled concurrently would overwrite each other's audio. Consider a
        # per-request temporary file if concurrency is enabled.
        output_file = "output.mp3"
        tts.save(output_file)

        return output_file, summarized_text

    except Exception as e:
        return None, f"An error occurred: {str(e)}"
| |
|
| | |
# Gradio UI definition: a PDF upload and a language selector go in; the
# synthesized audio and the summary text come out of pdf_to_speech.
demo = gr.Interface(
    title="PDF Summary to Speech",
    description="Upload an English PDF file, select a language, and generate speech from a summarized version (approx. 4 paragraphs). The summarized text is also displayed.",
    fn=pdf_to_speech,
    inputs=[
        gr.File(
            label="Upload a PDF file",
            file_types=[".pdf"],
        ),
        gr.Dropdown(
            label="Select Language",
            choices=["en", "es", "fr"],
            value="en",
        ),
    ],
    outputs=[
        gr.Audio(label="Generated Speech"),
        gr.Textbox(label="Summarized Text"),
    ],
)


# Start the web app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()