Spaces:

vividsd
/

practice

Build error

App Files Files Community

vividsd committed on Dec 11, 2023

Commit

b16f8b5

1 Parent(s): 6e412f0

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -178

app.py CHANGED Viewed

@@ -2,206 +2,76 @@
 # I tried to reuse my previous code, with some adaptations so that it works for any PDF that contains an abstract
-import gradio as gr
-from transformers import pipeline
-from tempfile import NamedTemporaryFile
 import PyPDF2
-from PyPDF2 import PdfReader
-from pdfminer.high_level import extract_pages, extract_text
-import pdfplumber
-from PIL import Image
-from pdf2image import convert_from_path
-from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
-import pytesseract
-import os
-import numpy as np
-import torch
-import sentencepiece
-import soundfile as sf
-from IPython.display import Audio
 from datasets import load_dataset
 from transformers import SpeechT5HifiGan
-def read_pdf(pdf_path):
-  """Collect text, figure text and table text from every page of a PDF.
-
-  Iterates over the pages yielded by ``extract_pages(pdf_path)`` and, for each
-  page, gathers the text of text containers, the text returned by
-  ``image_to_text`` for figures, and the string form of tables found by
-  pdfplumber.
-
-  Args:
-    pdf_path: Path of the PDF file to read.
-
-  Returns:
-    A dict keyed by ``'Page_<n>'`` whose value is the list
-    ``[page_text, line_format, text_from_images, text_from_tables,
-    page_content]`` for that page.
-
-  NOTE(review): relies on ``text_extraction``, ``crop_image``,
-  ``convert_to_images``, ``image_to_text``, ``extract_table`` and
-  ``table_converter``, none of which are defined in the visible part of this
-  file -- confirm they exist before calling.
-  """
-  # create a PDF file object
-  pdfFileObj = open(pdf_path, 'rb')
-  # create a PDF reader object
-  pdfReaded = PyPDF2.PdfReader(pdfFileObj)
-  # Dictionary that collects the extracted content of each page
-  text_per_page = {}
-  # We extract the pages from the PDF
-  for pagenum, page in enumerate(extract_pages(pdf_path)):
-      print("Elaborating Page_" +str(pagenum))
-      # Initialize the variables needed for the text extraction from the page
-      pageObj = pdfReaded.pages[pagenum]
-      page_text = []
-      line_format = []
-      text_from_images = []
-      text_from_tables = []
-      page_content = []
-      # Initialize the number of the examined tables
-      table_num = 0
-      first_element= True
-      table_extraction_flag= False
-      # Open the pdf file
-      # NOTE(review): the file is reopened with pdfplumber on every page
-      # iteration and is never closed inside this function.
-      pdf = pdfplumber.open(pdf_path)
-      # Find the examined page
-      page_tables = pdf.pages[pagenum]
-      # Find the number of tables on the page
-      tables = page_tables.find_tables()
-      # Find all the elements
-      page_elements = [(element.y1, element) for element in page._objs]
-      # Sort all the elements as they appear in the page
-      page_elements.sort(key=lambda a: a[0], reverse=True)
-      # Find the elements that composed a page
-      for i,component in enumerate(page_elements):
-          # Extract the position of the top side of the element in the PDF
-          pos= component[0]
-          # Extract the element of the page layout
-          element = component[1]
-          # Check if the element is a text element
-          if isinstance(element, LTTextContainer):
-              # Check if the text appeared in a table
-              if table_extraction_flag == False:
-                  # Use the function to extract the text and format for each text element
-                  (line_text, format_per_line) = text_extraction(element)
-                  # Append the text of each line to the page text
-                  page_text.append(line_text)
-                  # Append the format for each line containing text
-                  line_format.append(format_per_line)
-                  page_content.append(line_text)
-              else:
-                  # Omit the text that appeared in a table
-                  pass
-          # Check the elements for images
-          if isinstance(element, LTFigure):
-              # Crop the image from the PDF
-              crop_image(element, pageObj)
-              # Convert the cropped pdf to an image
-              convert_to_images('cropped_image.pdf')
-              # Extract the text from the image
-              image_text = image_to_text('PDF_image.png')
-              text_from_images.append(image_text)
-              page_content.append(image_text)
-              # Add a placeholder in the text and format lists
-              page_text.append('image')
-              line_format.append('image')
-          # Check the elements for tables
-          if isinstance(element, LTRect):
-              # If the first rectangular element
-              if first_element == True and (table_num+1) <= len(tables):
-                  # Find the bounding box of the table
-                  lower_side = page.bbox[3] - tables[table_num].bbox[3]
-                  upper_side = element.y1
-                  # Extract the information from the table
-                  table = extract_table(pdf_path, pagenum, table_num)
-                  # Convert the table information in structured string format
-                  table_string = table_converter(table)
-                  # Append the table string into a list
-                  text_from_tables.append(table_string)
-                  page_content.append(table_string)
-                  # Set the flag as True to avoid the content again
-                  table_extraction_flag = True
-                  # Make it another element
-                  first_element = False
-                  # Add a placeholder in the text and format lists
-                  page_text.append('table')
-                  line_format.append('table')
-                  # Check if we already extracted the tables from the page
-                  # NOTE(review): this check is nested inside the
-                  # first-rectangle branch above, so it only runs right after
-                  # a table was extracted -- confirm the intended indentation.
-                  # `page_elements[i+1]` is also out of range when the current
-                  # element is the last one on the page.
-                  if element.y0 >= lower_side and element.y1 <= upper_side:
-                      pass
-                  elif not isinstance(page_elements[i+1][1], LTRect):
-                      table_extraction_flag = False
-                      first_element = True
-                      table_num+=1
-      # Create the key of the dictionary
-      dctkey = 'Page_'+str(pagenum)
-      # Add the list of list as the value of the page key
-      text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]
-  # Closing the pdf file object
-  pdfFileObj.close()
-  return text_per_page
-# NOTE(review): `pdf_file` is not defined anywhere in the visible code --
-# confirm where it is supposed to come from.
-pdf_path = pdf_file.name
-text_per_page = read_pdf(pdf_path)
-page_0 = text_per_page['Page_0']
-# Flatten the lists stored for the first page, keeping only the string items
-page_0_clean = [item for sublist in page_0 for item in sublist if isinstance(item, str)]
-for i in range(len(page_0_clean)):
-    page_0_clean[i] = page_0_clean[i].replace('\n', ' ').strip()
-# Instead of cutting at an exact position as I did in my previous code (I don't know it here), I try to identify the abstract section
-abstract = 'abstract'
-found_abstract = False
-intro_string ='introduction'
-extracted_abstract =""
-# NOTE(review): `extracted_text_string` is not defined in the visible code.
-extracted_abstract = extracted_text_string.replace("Abstract", "")
-# NOTE(review): `text` is only assigned further down (`text = sentence`), so
-# it is not yet defined at this point in the visible code.
-file = text.splitlines()
-# NOTE(review): this loop is indented although the preceding statement opens
-# no block. It only toggles `found_abstract` and never adds anything to
-# `extracted_abstract`.
-    for lines in file:
-      lower_lines = lines.lower()
-      if lower_lines.strip()== abstract:
-        found_abstract = True
-      elif "1" in lower_lines.strip() and intro_string in lower_lines.strip():
-        found_abstract = False
-# Summarizing the abstract
 from transformers import pipeline
 summarizer = pipeline("summarization", model="Falconsai/text_summarization")
-text1 = extracted_abstract
-print(summarizer(text1, max_length=20, min_length=10, do_sample=False))
-# Here I try to store the summary in a variable, since in my previous code I had copied and pasted the summary by hand
-# NOTE(review): `summarized_text` is never assigned in the visible code; the
-# summarizer result above is printed but not stored.
-sentence = summarized_text[0]['summary_text']
-# Generating the audio of the output by using my previous code
-from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
-processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
-text = sentence
-inputs = processor(text=sentence, return_tensors="pt")
-embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-with torch.no_grad():
     speech = vocoder(spectrogram)
 speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
 Audio(speech, rate=16000)
 # Creating the Gradio app
 input_component = gr.File(file_types=["pdf"])
 output_component = gr.Audio()
 demo = gr.Interface(
-    # NOTE(review): read_pdf returns a dict of page contents, while the
-    # output component is gr.Audio -- confirm this pairing was intended.
-    fn=read_pdf,
     inputs=input_component,
     outputs=output_component,
     title="Reading your abstract summary outloud",
-    description="Upload a PDF that contains an Abstract. Get your abstract summarized in 1 sentence and read outloud. We only accept with PDfs that contains the section Abstract"
 )
 demo.launch()

 # I tried to reuse my previous code, with some adaptations so that it works for any PDF that contains an abstract
+#imports
 import PyPDF2
+from transformers import pipeline
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
 from datasets import load_dataset
+import torch
 from transformers import SpeechT5HifiGan
+from gradio import gr
+import gradio as gr
+# Now copying my code and adapting it for any PDF
+def extract_abstract(paper_filename):
+    """Return the abstract found on the first page of a PDF.
+
+    Reads the text of page 0 and returns the stripped text located between
+    the first occurrence of 'Abstract' and the first occurrence of
+    'Introduction'.
+
+    Args:
+        paper_filename: Path of the PDF file to open.
+
+    Returns:
+        The abstract text, or an empty string when either 'Abstract' or
+        'Introduction' is not found on the first page.
+    """
+    with open(paper_filename, 'rb') as file:
+        reader = PyPDF2.PdfReader(file)
+        text = reader.pages[0].extract_text()
+        # To extract the useful part of the first page, this assumes that
+        # papers follow a pattern in which the Abstract is followed by an
+        # Introduction, and cuts the text right before the Introduction.
+        # NOTE(review): both searches are case-sensitive and start at the
+        # beginning of the page, so an 'Introduction' that appears before
+        # 'Abstract' yields an empty slice.
+        abstract_start_index = text.find('Abstract')
+        introduction_start_index = text.find('Introduction')
+        if abstract_start_index == -1 or introduction_start_index == -1:
+            return ""  # Abstract or introduction section not found
+        abstract = text[abstract_start_index + len('Abstract'):introduction_start_index].strip()
+        return abstract
+    # NOTE(review): not reached -- every path inside the `with` block returns.
+    return ""
+# NOTE(review): hard-coded path; the file chosen in the Gradio upload widget
+# is not used here.
+paper_filename = '/content/Article_11'
+abstract_text = extract_abstract(paper_filename)
+print(abstract_text)
 from transformers import pipeline
 summarizer = pipeline("summarization", model="Falconsai/text_summarization")
+# NOTE(review): the summarizer runs twice on the same text, with
+# max_length=25 for the print below and max_length=26 for the stored result.
+print(summarizer(abstract_text, max_length=25, min_length=10, do_sample=False))
+output = summarizer(abstract_text, max_length=26, min_length=10, do_sample=False)
+# Keep only the summary string of the first result
+summary = output[0]['summary_text']
+print(summary)
+# proceeding to the audio function
+def audio(text):
+  """Run the SpeechT5 text-to-speech steps on the module-level `summary`.
+
+  Loads the SpeechT5 processor and model, tokenizes `summary`, loads the
+  speaker embedding at index 7306 of the CMU Arctic x-vectors validation
+  split, generates a spectrogram and passes it through the HiFi-GAN vocoder.
+
+  Args:
+    text: Not used by the body; the global `summary` is read instead.
+
+  NOTE(review): there is no `return` statement in this function, so the
+  computed `speech` is not handed back to the caller.
+  """
+  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+  # NOTE(review): bare expression with no effect.
+  summary
+  inputs = processor(text=summary, return_tensors="pt")
+  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+  spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
+  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+  with torch.no_grad():
     speech = vocoder(spectrogram)
 # NOTE(review): in the new version shown here, `model`, `inputs`,
 # `speaker_embeddings` and `vocoder` are only assigned inside audio(), and
 # `Audio` is not among the visible imports -- these two module-level
 # statements look like leftovers and may be what causes the build error.
 speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
 Audio(speech, rate=16000)
 # Creating the Gradio app
 input_component = gr.File(file_types=["pdf"])
 output_component = gr.Audio()
 demo = gr.Interface(
+    # NOTE(review): audio() ignores its argument and has no return value, so
+    # the uploaded file is not processed and nothing reaches the gr.Audio
+    # output -- confirm the intended wiring.
+    fn=audio,
     inputs=input_component,
     outputs=output_component,
     title="Reading your abstract summary outloud",
+    description="Upload a PDF that contains an Abstract. Get your abstract summarized in 1 sentence and read outloud. We only accept with PDfs that contains the section Abstract followed by one called Introduction"
 )
 demo.launch()