Palak Deb Patra committed on
Commit
8c3af37
·
verified ·
1 Parent(s): 6349662

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +165 -0
app.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """TEXT SUMMARIZATION Web APP"""
2
+
3
+ # Importing Packages
4
import base64
import mimetypes
from pathlib import Path

import streamlit as st
import torch
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import pipeline
11
+
12
+ # Load the tokenizer and model
13
checkpoint = 'Lamini-1'


@st.cache_resource(show_spinner=False)
def _load_model(model_dir):
    """Load the T5 tokenizer and model from ``model_dir`` and keep them cached.

    Streamlit re-executes this script from the top on every widget
    interaction; without caching, the checkpoint would be loaded again on
    each rerun. ``show_spinner=False`` is used so the cached call does not
    draw anything itself (presumably required because the page is only
    configured further down the script -- verify against the Streamlit
    version in use).

    Args:
        model_dir: Name or path handed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = T5Tokenizer.from_pretrained(model_dir)
    loaded_model = T5ForConditionalGeneration.from_pretrained(
        model_dir,
        device_map="auto",
        torch_dtype=torch.float32,
    )
    return loaded_tokenizer, loaded_model


# Module-level names kept as before; the functions below read them directly.
tokenizer, base_model = _load_model(checkpoint)
16
+
17
+
18
+ # File Loader & Processing
19
def file_processing(file):
    """Extract the text of a PDF as a single string.

    The PDF is loaded with ``PyPDFLoader``, re-split with a
    ``RecursiveCharacterTextSplitter`` (``chunk_size=200``,
    ``chunk_overlap=50``), and the ``page_content`` of every resulting
    chunk is concatenated in order with no separator.

    Args:
        file: Path of the PDF, passed straight to ``PyPDFLoader``.

    Returns:
        The concatenated chunk texts; an empty string when the loader
        yields no chunks.
    """
    loader = PyPDFLoader(file)
    pages = loader.load_and_split()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
    chunks = text_splitter.split_documents(pages)
    # NOTE(review): with chunk_overlap=50 neighbouring chunks presumably share
    # text, so the joined result may contain repeated passages -- confirm
    # whether that is intended before changing the splitter settings.
    return "".join(chunk.page_content for chunk in chunks)
29
+
30
+
31
+ # Language Model Pipeline -> Summarization
32
def llm_pipeline(filepath, summary_length):
    """Summarize the PDF stored at ``filepath``.

    Args:
        filepath: Path of the PDF; its text is obtained via
            ``file_processing``.
        summary_length: Value passed to the summarization pipeline as
            ``max_length``.

    Returns:
        The ``summary_text`` of the first pipeline result, or an empty
        string when no text could be extracted from the PDF.
    """
    document_text = file_processing(filepath)
    if not document_text.strip():
        # Nothing to summarize (e.g. a PDF without extractable text).
        return ""

    summarizer = pipeline(
        "summarization",
        model=base_model,
        tokenizer=tokenizer,
        max_length=summary_length,
        # Keep min_length <= max_length even if a caller asks for fewer than 50.
        min_length=min(50, summary_length),
    )
    return summarizer(document_text)[0]["summary_text"]
44
+
45
+
46
# Streamlit page setup: request the "wide" layout.
st.set_page_config(layout="wide")
48
+
49
+
50
+ # Display Background
51
def add_bg_from_local(image_file):
    """Use a local image as the background of the Streamlit app.

    The image bytes are base64-encoded and embedded as a ``data:`` URL in a
    ``<style>`` rule for ``.stApp`` that is injected via ``st.markdown``.

    Args:
        image_file: Path of the image file to embed.
    """
    # Declare the media type that matches the file (e.g. image/jpeg for
    # .jpg) instead of always claiming PNG; fall back to the previous value
    # when the extension is not recognised.
    mime_type = mimetypes.guess_type(image_file)[0] or "image/png"

    # A distinct handle name keeps the path parameter intact.
    with open(image_file, "rb") as image_handle:
        encoded_image = base64.b64encode(image_handle.read()).decode()

    css = (
        "\n<style>\n"
        ".stApp {\n"
        f"background-image: url(data:{mime_type};base64,{encoded_image});\n"
        "background-size: cover;\n"
        "opacity:0.9;\n"
        "}\n"
        "</style>\n"
    )
    st.markdown(css, unsafe_allow_html=True)
66
+
67
+
68
# Apply the page background.
add_bg_from_local("Images/background.jpg")

# Inject the custom font rules read from font.css.
with open("font.css") as css_file:
    font_css = css_file.read()
st.markdown(f"<style>{font_css}</style>", unsafe_allow_html=True)

# Sidebar: picture, title, description and the summary-length slider.
sidebar = st.sidebar
sidebar.image("Images/sidebar_pic.png")
sidebar.title("ABOUT THE APP")
sidebar.write(
    "SummaScribe: Your PDF wingman! 🚀 Unleash the power of Streamlit and LangChain to transform boring text PDFs into "
    "snappy summaries. Lightning-fast processing,ninja-level NLP algorithms, and a touch of magic—making info "
    "extraction a breeze!"
)
selected_summary_length = sidebar.slider(
    "SELECT SUMMARY STRENGTH",
    min_value=50,
    max_value=1000,
    value=500,
)
84
+
85
+
86
+ # Display pdf of a given file
87
def display(file):
    """Render the PDF stored at ``file`` inline in the page.

    The file is base64-encoded and embedded as a ``data:`` URL in an
    ``<iframe>`` written with ``st.markdown``.

    Deliberately not cached: the function returns nothing, and a cache keyed
    only on the path would keep showing the previous document after a
    different upload has been written to the same path.

    Args:
        file: Path of the PDF file to embed.
    """
    with open(file, "rb") as pdf_handle:
        base64_pdf = base64.b64encode(pdf_handle.read()).decode("utf-8")

    # Embed the document in an iframe and hand the HTML to Streamlit.
    display_pdf = (
        f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="500" '
        f'type="application/pdf"></iframe>'
    )
    st.markdown(display_pdf, unsafe_allow_html=True)
99
+
100
+
101
+ # Main content
102
# Stylesheet for the page title: centred, with a colour change and a slight
# zoom on hover.
_TITLE_CSS = """
<style>
.summascribe-title {
font-size: 57px;
text-align: center;
transition: transform 0.2s ease-in-out;
}
.summascribe-title span {
transition: color 0.2s ease-in-out;
}
.summascribe-title:hover span {
color: #f5fefd; /* Hover color */
}
.summascribe-title:hover {
transform: scale(1.15);
}
</style>
"""
st.markdown(_TITLE_CSS, unsafe_allow_html=True)

# Build the title letter by letter; the HSL lightness starts at 70 and drops
# a little with every character.
text = "SummaScribe"  # Text to be styled
letter_spans = []
for i, char in enumerate(text):
    lightness = 70 - (i * 10 / len(text))
    letter_spans.append(f'<span style="color: hsl(220, 60%, {lightness}%);">{char}</span>')
colored_text = ''.join(letter_spans)

# Append the decorative star glyph after the name.
colored_text_with_malt = colored_text + ' <span style="color: hsl(220, 60%, 70%);">&#x2727;</span>'
st.markdown(f'<h1 class="summascribe-title">{colored_text_with_malt}</h1>', unsafe_allow_html=True)

# Subtitle below the title.
st.markdown(
    '<h2 style="font-size:30px;color: #F5FEFD; text-align: center;">Text Document Summarization using LLMs</h2>',
    unsafe_allow_html=True,
)
135
+
136
+
137
+ # Your Streamlit app content here...
138
def main():
    """Render the upload widget and, on request, show the PDF beside its summary.

    When a PDF has been uploaded and the "Summarize" button is pressed, the
    upload is written under ``data/``, displayed in the left column, and
    summarized in the right column using the length chosen in the sidebar.
    """
    uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
    with st.expander("NOTE"):
        st.write(
            "Summascribe currently accepts PDF documents that contain only text and no images. This limitation is due "
            "to our app's current focus on leveraging advanced natural language processing (NLP) algorithms to "
            "extract key information from textual content."
        )

    # The button is only drawn once a file is present (short-circuit).
    if uploaded_file is None or not st.button("Summarize"):
        return

    col1, col2 = st.columns((1, 1))

    # The loader and the viewer both work on a path, so persist the upload.
    # Create the target directory if needed, and keep only the final path
    # component of the client-supplied name so it cannot point outside data/.
    data_dir = Path("data")
    data_dir.mkdir(parents=True, exist_ok=True)
    filepath = str(data_dir / Path(uploaded_file.name).name)
    with open(filepath, "wb") as temp_file:
        temp_file.write(uploaded_file.read())

    with col1:
        st.info("Uploaded File")
        display(filepath)
    with col2:
        st.info("Summary")
        # st.spinner only shows while its ``with`` block is running.
        with st.spinner(text="In progress..."):
            summary = llm_pipeline(filepath, selected_summary_length)
        st.success(summary, icon="✅")


if __name__ == "__main__":
    main()