Spaces:
Build error
Build error
Upload function.py
Browse files- function.py +111 -0
function.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core Pkgs
|
| 2 |
+
import streamlit as st
|
| 3 |
+
from transformers import pipeline
|
| 4 |
+
from PyPDF2 import PdfFileReader
|
| 5 |
+
import docx2txt
|
| 6 |
+
import base64
|
| 7 |
+
import re
|
| 8 |
+
import sqlite3
|
| 9 |
+
import time
|
| 10 |
+
from io import StringIO
|
| 11 |
+
import warnings
|
| 12 |
+
warnings.filterwarnings("ignore")
|
| 13 |
+
|
| 14 |
+
time_str = time.strftime("%Y%m%d-%H%M%S")
|
| 15 |
+
# Loader functions for the Hugging Face summarization pipelines
|
| 16 |
+
@st.cache(allow_output_mutation=True)
def bart():
    """Build the BART summarization pipeline.

    Returns:
        The object produced by ``pipeline('summarization', ...)`` for the
        ``facebook/bart-large-cnn`` checkpoint. The function is wrapped in
        ``st.cache(allow_output_mutation=True)``.
    """
    return pipeline('summarization', model='facebook/bart-large-cnn')
|
| 21 |
+
|
| 22 |
+
# Cached for the same reason as the BART loader: without the decorator every
# call constructs a brand-new pipeline object.
@st.cache(allow_output_mutation=True)
def t5():
    """Build the T5 summarization pipeline.

    Returns:
        The object produced by ``pipeline("summarization", ...)`` using
        ``t5-base`` for both the model and the tokenizer.
    """
    summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base")
    return summarizer
|
| 26 |
+
|
| 27 |
+
# Cached for the same reason as the BART loader: without the decorator every
# call constructs a brand-new pipeline object.
@st.cache(allow_output_mutation=True)
def bart_t5():
    """Build the summarization pipeline exposed under the ``bart_t5`` name.

    NOTE: despite the function name, the checkpoint loaded here is
    ``tuner007/pegasus_summarizer``.

    Returns:
        The object produced by ``pipeline("summarization", ...)`` for that
        checkpoint.
    """
    summarizer = pipeline("summarization", model="tuner007/pegasus_summarizer")
    return summarizer
|
| 31 |
+
|
| 32 |
+
# def pegasus():
|
| 33 |
+
# ''' Loading pegasus model using pipeline api '''
|
| 34 |
+
# summarizer = pipeline('summarization',model='google/pegasus-xsum')
|
| 35 |
+
# return summarizer
|
| 36 |
+
|
| 37 |
+
def preprocess_plain_text(x):
    """Normalize raw text by stripping noise tokens and special characters.

    Steps, in order:
      1. drop every non-ASCII character;
      2. remove ``http://`` / ``https://`` URLs;
      3. remove ``@mention`` and ``#hashtag`` tokens;
      4. collapse runs of whitespace;
      5. replace every run of characters other than letters, digits and
         ``. , ! ?`` with a single space.

    Args:
        x: The input string.

    Returns:
        The cleaned string. Leading/trailing single spaces left behind by
        the substitutions are not stripped.
    """
    x = x.encode("ascii", "ignore").decode()  # drop non-ASCII characters
    # Require the scheme separator so ordinary words that merely start with
    # "http" (e.g. "httpd") are not swallowed as if they were URLs.
    x = re.sub(r"https?://\S+", " ", x)  # urls
    x = re.sub(r"@\S+", " ", x)  # mentions
    x = re.sub(r"#\S+", " ", x)  # hashtags
    x = re.sub(r"\s{2,}", " ", x)  # repeated whitespace
    x = re.sub(r"[^.,!?A-Za-z0-9]+", " ", x)  # special characters except .,!?

    return x
|
| 47 |
+
|
| 48 |
+
def extract_pdf(file):
    """Return the concatenated text of every page in a PDF.

    Args:
        file: Whatever ``PdfFileReader`` accepts as its source argument.

    Returns:
        A single string made of each page's ``extractText()`` output, joined
        in page order with no separator.
    """
    # NOTE(review): PdfFileReader / numPages / getPage / extractText look like
    # the legacy PyPDF2 names — confirm the pinned PyPDF2 version still
    # provides them.
    reader = PdfFileReader(file)
    page_texts = (
        reader.getPage(index).extractText()
        for index in range(reader.numPages)
    )
    return "".join(page_texts)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def extract_text_from_file(file):
    """Extract text from an uploaded file based on its MIME type.

    Args:
        file: An uploaded-file object exposing a ``type`` attribute (MIME
            type string). For ``text/plain`` it must also provide
            ``getvalue()`` returning UTF-8 encoded bytes.

    Returns:
        The extracted text as a string.

    Raises:
        ValueError: If ``file.type`` is not plain text, PDF, or DOCX.
            (Previously this case fell through to an ``UnboundLocalError``.)
    """
    mime_type = file.type

    # Plain text: decode the uploaded bytes directly.
    if mime_type == "text/plain":
        return file.getvalue().decode("utf-8")

    # PDF: delegate to the page-by-page extractor.
    if mime_type == "application/pdf":
        return extract_pdf(file)

    # DOCX
    if (
        mime_type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        return docx2txt.process(file)

    raise ValueError(f"Unsupported file type: {mime_type!r}")
|
| 86 |
+
|
| 87 |
+
def summary_downloader(raw_text):
    """Render a download link for ``raw_text`` in the Streamlit page.

    The text is base64-encoded into a ``data:`` URI and emitted as an HTML
    anchor via ``st.markdown(..., unsafe_allow_html=True)``.

    Args:
        raw_text: The string to offer as a downloadable ``.txt`` file.
    """
    b64 = base64.b64encode(raw_text.encode()).decode()
    # Stamp the filename at call time. The module-level ``time_str`` is
    # computed once when the module is imported, so using it here would give
    # every download the same stale timestamp.
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    new_filename = "new_text_file_{}_.txt".format(timestamp)
    st.markdown("#### Download Summary as a File ###")
    href = f'<a href="data:file/txt;base64,{b64}" download="{new_filename}">Click to Download!!</a>'
    st.markdown(href, unsafe_allow_html=True)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# Database storage: one module-level SQLite connection and cursor, both used
# by the SQL helper functions in this module.
conn = sqlite3.connect(
    'summarizer_database.db',
    check_same_thread=False,
)
c = conn.cursor()
|
| 99 |
+
# SQL helper functions
|
| 100 |
+
def create_table():
    """Create ``TextTable`` through the module-level cursor if it is missing.

    The table has three columns: ``text_to_summarize`` (TEXT),
    ``summarized_text`` (TEXT) and ``postdate`` (DATE).
    """
    ddl = 'CREATE TABLE IF NOT EXISTS TextTable(text_to_summarize TEXT,summarized_text TEXT,postdate DATE)'
    c.execute(ddl)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def add_data(text_to_summarize, summarized_text, postdate):
    """Insert one row into ``TextTable`` and commit it.

    Args:
        text_to_summarize: Value for the ``text_to_summarize`` column.
        summarized_text: Value for the ``summarized_text`` column.
        postdate: Value for the ``postdate`` column.
    """
    insert_sql = 'INSERT INTO TextTable(text_to_summarize,summarized_text,postdate) VALUES (?,?,?)'
    row = (text_to_summarize, summarized_text, postdate)
    # Values are bound through placeholders rather than formatted into the SQL.
    c.execute(insert_sql, row)
    conn.commit()
|
| 107 |
+
|
| 108 |
+
def view_all_data():
    """Fetch every row of ``TextTable`` through the module-level cursor.

    Returns:
        The list returned by ``fetchall()`` for ``SELECT * FROM TextTable``.
    """
    return c.execute("SELECT * FROM TextTable").fetchall()
|