Spaces:
Sleeping
Sleeping
File size: 4,937 Bytes
19977fc 0853bbb fef1930 4defedc 0853bbb fef1930 0853bbb 4defedc 0853bbb fef1930 4defedc 0853bbb fef1930 19977fc fef1930 e6f2f04 fef1930 abf3636 fef1930 19977fc fef1930 e6f2f04 fef1930 abf3636 20c1d8c fef1930 abf3636 fef1930 abf3636 fef1930 e6f2f04 fef1930 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
import docx
import streamlit as st
import os
import PyPDF2
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Hugging Face Hub checkpoint name; both the model and the tokenizer are
# loaded from it, and it is shown in the page description.
checkpoint = "facebook/bart-large-cnn"
@st.cache_resource
def load_model():
    """Return the seq2seq model loaded from the module-level ``checkpoint``.

    Decorated with ``st.cache_resource`` so the loaded model object is
    reused rather than rebuilt on each call.
    """
    return AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
@st.cache_resource
def load_tokenizer():
    """Return the tokenizer loaded from the module-level ``checkpoint``.

    Decorated with ``st.cache_resource`` so the loaded tokenizer object is
    reused rather than rebuilt on each call.
    """
    return AutoTokenizer.from_pretrained(checkpoint)
def load_text_file(file):
    """Return the contents of an uploaded plain-text file as a string.

    Args:
        file: Object exposing ``getvalue()`` that returns the raw bytes of
            the file (``main`` passes the object returned by
            ``st.file_uploader``).

    Returns:
        The decoded text. A leading UTF-8 byte-order mark is dropped, and
        bytes that are not valid UTF-8 are replaced with U+FFFD instead of
        raising ``UnicodeDecodeError``.
    """
    raw_bytes = file.getvalue()
    # Any upload that is not a PDF or DOCX is sent here, so the bytes are not
    # guaranteed to be UTF-8; decode leniently rather than crash the app.
    return raw_bytes.decode("utf-8-sig", errors="replace")
def load_pdf_file(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: PDF source handed straight to ``PyPDF2.PdfReader``.

    Returns:
        The extracted page texts joined with no separator. A page whose
        ``extract_text()`` result is falsy contributes an empty string.
    """
    reader = PyPDF2.PdfReader(file)
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return "".join(page_texts)
def load_word_file(file):
    """Return the paragraph text of a Word document, one paragraph per line.

    Args:
        file: Document source handed straight to ``docx.Document``.

    Returns:
        The ``text`` of each entry in ``doc.paragraphs`` joined with
        newline characters.
    """
    document = docx.Document(file)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
def split_text_into_chunks(text, max_chunk_length):
    """Split text into whitespace-normalised chunks of bounded length.

    Words (as produced by ``str.split()``) are packed greedily into chunks;
    words inside a chunk are separated by a single space.

    Args:
        text: The text to split.
        max_chunk_length: Maximum number of characters per chunk.

    Returns:
        A list of non-empty chunk strings. Every chunk is at most
        ``max_chunk_length`` characters long, except that a single word
        longer than the limit is kept whole as its own chunk. Empty or
        whitespace-only text yields an empty list.
    """
    chunks = []
    current_words = []
    current_length = 0  # always equals len(" ".join(current_words))
    for word in text.split():
        # A separating space is only needed once the chunk already has a word.
        separator = 1 if current_words else 0
        candidate_length = current_length + separator + len(word)
        if current_words and candidate_length > max_chunk_length:
            chunks.append(" ".join(current_words))
            current_words = [word]
            current_length = len(word)
        else:
            # Also taken for an oversized first word: it is accepted rather
            # than flushing an empty chunk ahead of it.
            current_words.append(word)
            current_length = candidate_length
    if current_words:
        chunks.append(" ".join(current_words))
    return chunks
def _summary_token_bounds(word_count, summary_balance):
    """Return ``(min_new_tokens, max_new_tokens)`` for a text of ``word_count`` words.

    The bounds are a fraction of the word count chosen by ``summary_balance``
    ("concise", "balanced" or "detailed"). When the upper bound would exceed
    1024 it is capped at 1024 and the lower bound is set to 512. The upper
    bound is never below 1 and the lower bound never exceeds the upper bound.
    """
    ratios = {
        "concise": (0.1, 0.3),
        "balanced": (0.2, 0.4),
        "detailed": (0.5, 0.8),
    }
    min_ratio, max_ratio = ratios[summary_balance]
    min_tokens = int(word_count * min_ratio)
    max_tokens = int(word_count * max_ratio)
    if max_tokens > 1024:
        max_tokens = 1024
        min_tokens = 512
    # Very short texts would otherwise request zero new tokens.
    max_tokens = max(1, max_tokens)
    min_tokens = min(min_tokens, max_tokens)
    return min_tokens, max_tokens


def main():
    """Render the Streamlit summarisation page and handle one script run.

    Builds the page (title, detail slider in the sidebar, and three tabs),
    reads the text to summarise either from the text area or from an
    uploaded PDF / DOCX / plain-text file, and, when the "Summarise" button
    is pressed, writes a summary and its character/word counts to the page.

    The input is split into chunks sized to the model's input window; every
    chunk is summarised and the chunk summaries are joined with blank lines,
    so text beyond the first window is no longer dropped.
    """
    st.set_page_config(
        page_title="Summarisation Tool",
        page_icon="🧊",
        layout="wide",
        initial_sidebar_state="expanded",
    )
    model = load_model()
    print("Model's maximum sequence length:", model.config.max_position_embeddings)
    tokenizer = load_tokenizer()
    print("Tokenizer's maximum sequence length:", tokenizer.model_max_length)

    st.title("Summarisation Tool")
    st.write(
        f"Performs basic summarisation of text and audio using the '{checkpoint}' model."
    )
    st.sidebar.title("Options")
    summary_balance = st.sidebar.select_slider(
        "Output Summarisation Detail:",
        options=["concise", "balanced", "detailed"],
        value="balanced",
    )

    textTab, docTab, audioTab = st.tabs(["Plain Text", "Text Document", "Audio File"])
    with textTab:
        sentence = st.text_area(
            "Paste text to be summarised:",
            help="Paste text into text area and hit Summarise button",
            height=300,
        )
        st.write(f"{len(sentence)} characters and {len(sentence.split())} words")
    with docTab:
        uploaded_file = st.file_uploader("Select a file to be summarised:")
        if uploaded_file is not None:
            # An uploaded document takes precedence over the pasted text.
            file_name = os.path.basename(uploaded_file.name)
            _, file_ext = os.path.splitext(file_name)
            # Compare the extension case-insensitively so "REPORT.PDF" is
            # not sent to the plain-text loader.
            file_ext = file_ext.lower()
            if file_ext == ".pdf":
                sentence = load_pdf_file(uploaded_file)
            elif file_ext == ".docx":
                sentence = load_word_file(uploaded_file)
            else:
                sentence = load_text_file(uploaded_file)
            st.write(f"{len(sentence)} characters and {len(sentence.split())} words")
    with audioTab:
        st.text("Yet to be implemented...")

    button = st.button("Summarise")
    st.divider()
    with st.spinner("Generating Summary..."):
        # Whitespace-only input would produce no chunks, so skip it.
        if button and sentence.strip():
            # The tokenizer truncates each input to the model window, so
            # chunks must be small enough to fit or their tails are lost.
            # NOTE(review): assumes typical prose averages at least ~3
            # characters per token -- confirm for the target content.
            # truncation=True below remains as a safety net.
            max_chunk_chars = model.config.max_position_embeddings * 3
            # Drop any empty chunk defensively before tokenising.
            chunks = [
                chunk
                for chunk in split_text_into_chunks(sentence, max_chunk_chars)
                if chunk
            ]
            print(f"Split into {len(chunks)} chunks")

            summaries = []
            # One chunk per generate() call: no padding is needed and memory
            # use does not grow with the length of the document.
            for chunk in chunks:
                min_tokens, max_tokens = _summary_token_bounds(
                    len(chunk.split()), summary_balance
                )
                print(f"Tokenizer min tokens {min_tokens}, max tokens {max_tokens}")
                inputs = tokenizer(
                    chunk,
                    max_length=model.config.max_position_embeddings,
                    return_tensors="pt",
                    truncation=True,
                )
                summary_ids = model.generate(
                    inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    min_new_tokens=min_tokens,
                    max_new_tokens=max_tokens,
                    do_sample=False,
                )
                chunk_summary = tokenizer.decode(
                    summary_ids[0], skip_special_tokens=True
                ).strip()
                if chunk_summary:
                    summaries.append(chunk_summary)

            summary = "\n\n".join(summaries)
            st.write(summary)
            st.write(f"{len(summary)} characters and {len(summary.split())} words")
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|