Spaces:

Gladiator
/

Text-Summarizer

Runtime error

Gladiator committed on Jan 24, 2022

Commit

12c89e3

1 Parent(s): 3ebd8ef

add support for .txt, .docx, .pdf files

Files changed (2) hide show

test.py ADDED Viewed

+import docx2txt
+import streamlit as st
+from io import StringIO
+from PyPDF2 import PdfFileReader
+def read_pdf(file):
+    pdfReader = PdfFileReader(file)
+    count = pdfReader.numPages
+    all_page_text = ""
+    for i in range(count):
+        page = pdfReader.getPage(i)
+        all_page_text += page.extractText()
+    return all_page_text
+if __name__ == "__main__":
+    st.header("Testing file uploads")
+    uploaded_file = st.file_uploader("Upload a file here")
+    st.write(uploaded_file.type)
+    docx_text = docx2txt.process(uploaded_file)
+    st.write(docx_text)

utils.py CHANGED Viewed

@@ -1,5 +1,9 @@
 import re
 import requests
 from bs4 import BeautifulSoup
 from nltk.tokenize import sent_tokenize
@@ -98,10 +102,36 @@ def preprocess_text_for_abstractive_summarization(tokenizer, text):
     return chunks
 def read_text_from_file(file):
-    # txt_file = open(file, "r")
-    file_text = file.read()
-    # txt_file.close()
-    return file_text

 import re
 import requests
+import docx2txt
+from io import StringIO
+from PyPDF2 import PdfFileReader
 from bs4 import BeautifulSoup
 from nltk.tokenize import sent_tokenize
     return chunks
+def read_pdf(file):
+    pdfReader = PdfFileReader(file)
+    count = pdfReader.numPages
+    all_page_text = ""
+    for i in range(count):
+        page = pdfReader.getPage(i)
+        all_page_text += page.extractText()
+    return all_page_text
 def read_text_from_file(file):
+    # read text file
+    if file.type == "text/plain":
+        # To convert to a string based IO:
+        stringio = StringIO(file.getvalue().decode("utf-8"))
+        # To read file as string:
+        file_content = stringio.read()
+    # read pdf file
+    elif file.type == "application/pdf":
+        file_content = read_pdf(file)
+    # read docx file
+    elif (
+        file.type
+        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    ):
+        file_content = docx2txt(file)
+    return file_content