Spaces:

barser65
/

assessment3

Build error

App Files Files Community

barser65 committed on Dec 11, 2023

Commit

403f739

1 Parent(s): 2addf3f

Update app.py

Browse files

Files changed (1) hide show

app.py +141 -1

app.py CHANGED Viewed

@@ -1,5 +1,145 @@
-import myfunct
 import gradio as gr
 #def greet(name):

+def converti(path):
+    import pip
+    def install(package):
+        if hasattr(pip, 'main'):
+            pip.main(['install', package])
+        else:
+            pip._internal.main(['install', package])
+    install('git+https://github.com/huggingface/transformers.git')
+    install('datasets sentencepiece')
+    install('PyPDF2')
+    install('pdfminer.six')
+    install('pdfplumber')
+    install('poppler-utils')
+    install('tesseract-ocr')
+    install('libtesseract-dev')
+    # To read the PDF
+    import PyPDF2
+    # To analyze the PDF layout and extract text
+    from pdfminer.high_level import extract_pages, extract_text
+    from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
+    # To extract text from tables in PDF
+    import pdfplumber
+    # To remove the additional created files
+    import os
+    # Create a function to extract text
+    def text_extraction(element):
+        # Extracting the text from the in-line text element
+        line_text = element.get_text()
+        # Find the formats of the text
+        # Initialize the list with all the formats that appeared in the line of text
+        line_formats = []
+        for text_line in element:
+            if isinstance(text_line, LTTextContainer):
+                # Iterating through each character in the line of text
+                for character in text_line:
+                    if isinstance(character, LTChar):
+                        # Append the font name of the character
+                        line_formats.append(character.fontname)
+                        # Append the font size of the character
+                        line_formats.append(character.size)
+        # Find the unique font sizes and names in the line
+        format_per_line = list(set(line_formats))
+        # Return a tuple with the text in each line along with its format
+        return (line_text, format_per_line)
+    def read_pdf(pdf_path):
+      # create a PDF file object
+      pdfFileObj = open(pdf_path, 'rb')
+      # create a PDF reader object
+      pdfReaded = PyPDF2.PdfReader(pdfFileObj)
+      # Create the dictionary to extract text from each image
+      text_per_page = {}
+      # We extract the pages from the PDF
+      for pagenum, page in enumerate(extract_pages(pdf_path)):
+          print("Elaborating Page_" +str(pagenum))
+          # Initialize the variables needed for the text extraction from the page
+          pageObj = pdfReaded.pages[pagenum]
+          page_text = []
+          line_format = []
+          text_from_images = []
+          text_from_tables = []
+          page_content = []
+          # Initialize the number of the examined tables
+          table_num = 0
+          first_element= True
+          table_extraction_flag= False
+          # Open the pdf file
+          pdf = pdfplumber.open(pdf_path)
+          # Find the examined page
+          page_tables = pdf.pages[pagenum]
+          # Find the number of tables on the page
+          tables = page_tables.find_tables()
+          # Find all the elements
+          page_elements = [(element.y1, element) for element in page._objs]
+          # Sort all the elements as they appear in the page
+          page_elements.sort(key=lambda a: a[0], reverse=True)
+          # Find the elements that composed a page
+          for i,component in enumerate(page_elements):
+              # Extract the position of the top side of the element in the PDF
+              pos= component[0]
+              # Extract the element of the page layout
+              element = component[1]
+              # Check if the element is a text element
+              if isinstance(element, LTTextContainer):
+                  # Check if the text appeared in a table
+                  if table_extraction_flag == False:
+                      # Use the function to extract the text and format for each text element
+                      (line_text, format_per_line) = text_extraction(element)
+                      # Append the text of each line to the page text
+                      page_text.append(line_text)
+                      # Append the format for each line containing text
+                      line_format.append(format_per_line)
+                      page_content.append(line_text)
+                  else:
+                      # Omit the text that appeared in a table
+                      pass
+          # Create the key of the dictionary
+          dctkey = 'Page_'+str(pagenum)
+          # Add the list of list as the value of the page key
+          text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]
+      # Closing the pdf file object
+      pdfFileObj.close()
+      return text_per_page
+    from google.colab import drive
+    drive.mount('/content/drive')
+    pdf_path = '/content/drive/MyDrive/' + path
+    text_per_page = read_pdf(pdf_path)
+    abstr = ''
+    while len(abstr) == 0:
+      for par in range(len(text_per_page)):
+        for x in text_per_page['Page_'+str(par)]:
+          mystring = ' '.join(map(str,x))
+          if mystring.find('Abstract\n') > 0:
+            abstr0 = mystring[mystring.find('Abstract\n')+10:]
+            abstr = abstr0[:abstr0.find('1\n')]
+    print(abstr)
+    from transformers import pipeline
+    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+    summary = summarizer(abstr, max_length=56)
+    summary_text = summary[0]['summary_text']
+    return summary_text
 import gradio as gr
 #def greet(name):