AkashKhamkar commited on
Commit
7151a6c
·
1 Parent(s): cd5b7b2

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +323 -0
app.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import sentence_transformers
3
+ from transformers import AutoTokenizer
4
+ from youtube_transcript_api import YouTubeTranscriptApi
5
+ import os
6
+ import ast
7
+ import pandas as pd
8
+ import before_run
9
+ from segmentation import SemanticTextSegmentation
10
+ import re
11
+ from symspellpy import SymSpell
12
+ import pkg_resources
13
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
14
+ from torch import cuda
15
+ from transformers import pipeline
16
+ from PIL import Image
17
+ from PIL import ImageDraw
18
+ from PIL import ImageFont
19
+
20
+
21
# Ensure the local transcript cache directory exists.
# NOTE: the original checked/created a hard-coded absolute Windows path
# ('C:/Users/akash/...'), but get_cc() below writes caption files to the
# *relative* 'transcripts' directory — so the check never guarded the path
# actually used.  makedirs(exist_ok=True) is also race-free and portable.
os.makedirs('transcripts', exist_ok=True)

# Run model inference on GPU when one is available, otherwise on CPU.
device = 'cuda' if cuda.is_available() else 'cpu'
24
+
25
def clean_text(link, start, end):
    """Download, window, segment and normalize the transcript of a YouTube video.

    Pipeline: grab the video id from *link*, fetch its English captions
    (manual preferred, auto-generated fallback) into a local text file,
    optionally window the captions between *start* and *end* (seconds —
    presumably; TODO confirm the units the caller passes), split the text
    into semantically coherent segments, re-segment words with SymSpell,
    and finally merge consecutive segments up to a 512-token budget.

    Returns a DataFrame with columns 'texts' (merged segment text) and
    'video_id', one row per model-sized chunk.
    """

    # t5-base tokenizer is used only to *count* tokens per segment below.
    tokenizer = AutoTokenizer.from_pretrained("t5-base")

    # SymSpell with the bundled English frequency dictionary, used by
    # word_seg() to re-insert spaces into run-together caption text.
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt"
    )
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

    def id_ts_grabber(link):
        """Return the video id from a watch URL of the form ...?v=<id>.

        NOTE(review): assumes the id is the text after the first '=' —
        breaks for URLs with extra query parameters; verify against callers.
        """
        youtube_video = link.split("=")
        video_id = youtube_video[1]
        #print(f""" This is the video ID: {video_id} and this is the Timestamp: {time_stamp}""")
        return video_id
        #print(f""" This is the video ID: {video_id} and no Timestamp was found""")

    def seg_getter(data, ts, es):
        """Map a [ts, es] time window onto caption-line indices.

        *data* is the raw caption file: one Python-dict literal per line
        with at least a 'start' key.  Returns (tid, ts_list_len): the index
        of the caption whose 'start' is nearest to *ts*, and the number of
        captions between it and the caption nearest to *es* (end of the
        video when *es* is falsy).
        """
        starts = []
        for line in data:
            ccs = ast.literal_eval(line)
            starts.append(float(ccs['start']))
        #print(starts)
        #ts_ = float(ts.strip("s&end"))
        #es_ = float(es.strip(es[-1]))
        # Debug output left visible in the Streamlit page.
        st.write('this is the value of es: ', es)
        if not(es):
            # No end point given: run to the last caption.
            e_val = starts[-1]
        else:
            # Caption start time closest to the requested end point.
            e_val = starts[min(range(len(starts)), key=lambda i: abs(starts[i]-float(es)))]

        # Caption start time closest to the requested start point.
        t_val = starts[min(range(len(starts)), key=lambda i: abs(starts[i]-float(ts)))]
        tid = starts.index(t_val)
        eid = starts.index(e_val)
        ts_list_len = len(starts[tid:eid])
        return tid, ts_list_len

    def get_cc(video_id):
        """Fetch English closed captions and cache them under transcripts/.

        Prefers a manually created transcript; falls back to an
        auto-generated one.  Each fetched caption dict is written on its own
        line.  Returns the cache file path, or None when no transcript is
        available or any API error occurs (errors are swallowed).
        """
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            try:
                # filter for manually created transcripts
                transcript = transcript_list.find_manually_created_transcript(['en', 'en-US', 'en-GB', 'en-IN'])
            except Exception as e:
                # print(e)
                transcript = None

            manual = True
            if not transcript:
                try:
                    # or automatically generated ones
                    transcript = transcript_list.find_generated_transcript(['en'])
                    manual = False
                except Exception as e:
                    # print(e)
                    transcript = None

            if transcript:
                # File name encodes whether the captions were manual or auto.
                if manual: file_name = os.path.join('transcripts', str(video_id) + "_cc_manual" + ".txt")
                else: file_name = os.path.join('transcripts', str(video_id) + "_cc_auto" + ".txt")
                with open(file_name, 'w') as file:
                    for line in transcript.fetch():
                        # One dict-repr per line; strip literal \xa0 / \n sequences.
                        file.write(str(line).replace(r'\xa0', ' ').replace(r'\n', '') + '\n')
                # print(f"CC downloaded in {file_name}")
                return file_name
            else:
                #print("No transcript found")
                return None

        except Exception as e:
            #print(e)
            return None

    def transcript_creator(filename, timestamp, end_pt):
        """Read the cached caption file and return the caption texts in window.

        No timestamp and no end point -> every caption; otherwise the window
        indices come from seg_getter() (a missing timestamp defaults to 0).
        Returns a list of caption text strings.
        """
        #print(filename)
        with open(filename, 'r') as f:
            data = f.readlines()
        #print("This is data: ", data)
        transcripts = []
        #print("this is ts: ",timestamp)
        if not(timestamp) and not(end_pt):
            #print("executing 1 ")
            # Whole video: take the text of every caption line.
            for line in data:
                ccs = ast.literal_eval(line)
                transcripts.append(ccs['text'])
            return transcripts

        elif not(timestamp) and end_pt:
            # Only an end point: window is [0, end_pt].
            timestamp = 0
            start, lenlist = seg_getter(data, timestamp, end_pt)

            for t in range(lenlist):
                ccs = ast.literal_eval(data[start+t])
                transcripts.append(ccs['text'])
            return transcripts

        else:
            #print("executing 2")
            # Start point given (end point optional — seg_getter handles it).
            start, lenlist = seg_getter(data, timestamp, end_pt)
            #print(f""" This is the ts list{ts_len}""")
            for t in range(lenlist):
                ccs = ast.literal_eval(data[start+t])
                transcripts.append(ccs['text'])
            return transcripts

    def transcript_collector(link, ts, es):
        """Fetch + window the transcript for *link*; return (texts, video_id)."""
        vid = id_ts_grabber(link)
        print(f""" Fetching the transcript """)
        filename = get_cc(vid)
        return transcript_creator(filename, ts, es), vid

    # One-row frame holding the raw caption texts and the video id.
    transcript = pd.DataFrame(columns=['text', 'video_id'])
    transcript.loc[0, 'text'], transcript.loc[0, 'video_id'] = transcript_collector(link, start, end)

    def segment(corpus):
        """Split caption texts into semantically coherent segments.

        Drops bracketed annotations like [Music], normalizes whitespace,
        then delegates to the project's SemanticTextSegmentation helper.
        Returns whatever get_segments() yields (iterated as strings below).
        """
        text_data = [re.sub(r'\[.*?\]', '', x).strip() for x in corpus]
        text_data = [x for x in text_data if x != '']
        df = pd.DataFrame(text_data, columns=["utterance"])
        # remove new line, tab, return
        df["utterance"] = df["utterance"].apply(lambda x: x.replace("\n", " ").replace("\r", " ").replace("\t", " "))
        # remove Nan
        df.dropna(inplace=True)
        sts = SemanticTextSegmentation(df)
        texts = sts.get_segments()
        return texts

    # One row per semantic segment, all tagged with the same video id.
    sf = pd.DataFrame(columns=['Segmented_Text', 'video_id'])

    text = segment(transcript.at[0, 'text'])
    for i in range(len(text)):
        sf.loc[i, 'Segmented_Text'] = text[i]
        sf.loc[i, 'video_id'] = transcript.at[0, 'video_id']

    def word_seg(text):
        """Re-segment run-together words with SymSpell (no spelling edits,
        max_edit_distance=0) after flattening all whitespace variants."""
        text = text.replace("\n", " ").replace("\r", " ").replace("\t", " ").replace("\xa0", " ")
        results = sym_spell.word_segmentation(text, max_edit_distance=0)
        texts = results.segmented_string
        #result = re.sub(r'[^\w\s]', '',texts).lower()
        return texts

    # Clean each segment and record its t5-base token count for packing below.
    for i in range(len(sf)):
        sf.loc[i, 'Segmented_Text'] = word_seg(sf.at[i, 'Segmented_Text'])
        sf.loc[i, 'Lengths'] = len(tokenizer(sf.at[i, 'Segmented_Text'])['input_ids'])

    # Output frame filled in-place by segment_loader (closure over `texts`).
    texts = pd.DataFrame(columns=['texts'])

    def segment_loader(dataframe):
        """Greedily merge consecutive same-video segments up to 512 tokens.

        Walks the segment frame; starting at each unconsumed row it appends
        following rows while the combined token count stays <= 512, then
        skips the rows it absorbed (via `flag`).  Mutates and returns the
        enclosing `texts` frame (one merged chunk per surviving row).
        """
        flag = 0
        for i in range(len(dataframe)):
            if flag > 0:
                # This row was already merged into a previous chunk.
                flag -= 1
                continue
            m = 512
            iter = 0
            texts.loc[i, 'texts'] = dataframe.at[i+iter, 'Segmented_Text']
            length = dataframe.at[i+iter, 'Lengths']
            texts.loc[i, 'video_id'] = dataframe.at[i, 'video_id']
            # Absorb following segments while they fit in the token budget
            # and belong to the same video.
            while i+iter < len(dataframe)-1 and dataframe.at[i, 'video_id'] == dataframe.at[i+iter+1, 'video_id']:
                if length + dataframe.at[i + iter + 1, 'Lengths'] <= m:
                    texts.loc[i, 'texts'] += " " + dataframe.at[i+iter+1, 'Segmented_Text']
                    length += dataframe.at[i+iter + 1, 'Lengths']
                    iter += 1
                else:
                    break

            flag = iter
        return texts

    cleaned_text = segment_loader(sf)
    cleaned_text.reset_index(drop=True, inplace=True)

    return cleaned_text
198
+
199
+
200
def t5_summarizer(link, start, end):
    """Summarize a YouTube video's transcript chunk by chunk.

    The transcript behind *link* (optionally windowed by *start*/*end*) is
    fetched and chunked by clean_text(); each chunk is summarized with the
    fine-tuned "CareerNinja/t5-large_3e-4" checkpoint.  Each summary is
    written to the Streamlit page (chunks separated by a divider line) and
    also stored back into the chunk frame under 'Generated Summary'.

    Returns the list of summary strings, one per chunk.
    """
    chunks = clean_text(link, start, end)

    # Build the summarization pipeline from the fine-tuned checkpoint.
    t5_tokenizer = AutoTokenizer.from_pretrained("CareerNinja/t5-large_3e-4")
    t5_model = AutoModelForSeq2SeqLM.from_pretrained("CareerNinja/t5-large_3e-4")
    summarize = pipeline("summarization", model=t5_model, tokenizer=t5_tokenizer)

    print(f""" Entered summarizer ! """)
    st.write('Below is the summary of the given URL: ')

    summaries = []
    last_index = len(chunks) - 1
    for row in range(len(chunks)):
        generated = summarize(chunks.at[row, 'texts'], min_length=64, max_length=128)
        # The pipeline returns [{'summary_text': ...}]; grab the first value.
        summary_text = list(generated[0].values())[0]
        chunks.loc[row, 'Generated Summary'] = summary_text
        summaries.append(summary_text)
        st.write(chunks.at[row, 'Generated Summary'])
        # Divider between summaries, but not after the final one.
        if row != last_index:
            st.write('=====================================================================================')
    return summaries
217
+
218
def card_creator(path, text, y_value):
    """Render *text* onto the card template at *path*, save and display it.

    The text is word-wrapped to the template width (22 px margin each side),
    drawn in white starting at vertical offset *y_value*, saved as
    'card.png' in the working directory, and shown in the Streamlit page.

    FIX: the original used ImageFont.getsize(), which was deprecated in
    Pillow 9.2 and removed in Pillow 10 — replaced with getlength() for
    widths and getbbox() for the line height.
    """
    img = Image.open(path)

    def text_wrap(text, font, max_width):
        """Wrap text base on specified width.
        This is to enable text of width more than the image width to be display
        nicely.
        @params:
            text: str
                text to wrap
            font: obj
                font of the text
            max_width: int
                width to split the text with
        @return
            lines: list[str]
                list of sub-strings
        """
        lines = []

        # If the text width is smaller than the image width, then no need to
        # split — just add it to the line list and return.
        if font.getlength(text) <= max_width:
            lines.append(text)
        else:
            # Split the line by spaces to get words.
            words = text.split(' ')
            i = 0
            # Append every word to a line while its width is shorter than
            # the image width.
            while i < len(words):
                line = ''
                while i < len(words) and font.getlength(line + words[i]) <= max_width:
                    line = line + words[i] + " "
                    i += 1
                if not line:
                    # A single word wider than the image: put it on its own
                    # line rather than looping forever.
                    line = words[i]
                    i += 1
                lines.append(line)
        return lines

    # NOTE(review): font path is relative to the working directory — confirm
    # it resolves when the app is launched from elsewhere.
    font_path = 'streamlit_app/static/Montserrat-Regular.ttf'
    font = ImageFont.truetype(font=font_path, size=22)
    lines = text_wrap(text, font, img.size[0] - 44)
    # Line height measured on 'hg' (covers ascender and descender);
    # getbbox()[3] matches the height component of the removed getsize().
    line_height = font.getbbox('hg')[3]

    draw = ImageDraw.Draw(img)
    # Draw text on image.
    color = 'rgb(255,255,255)'  # white color
    x = 22
    y = y_value
    for line in lines:
        draw.text((x, y), line, fill=color, font=font)
        y = y + line_height  # update y-axis for new line
    img.save("card.png")
    st.image(img, caption="Summary Card")
275
+
276
def main():
    """Streamlit entry point: URL form, summarization, and card generation.

    FIXES vs. original: the download section opened 'card.png' even when the
    card was never generated (the "Summary is too long !" path), crashing
    with FileNotFoundError — it is now guarded by os.path.exists(); the
    selectbox label typo 'creat' is corrected.
    """
    # Persist submission state across Streamlit reruns.
    if 'submitted' not in st.session_state:
        st.session_state.submitted = False

    if 'opt' not in st.session_state:
        st.session_state.opt = []

    def callback():
        # Marks the form as submitted; runs before the rerun triggered by
        # the button click.
        st.session_state.submitted = True

    st.title('Video Summarizer')
    url = st.text_input('Enter the Video Link')
    start_pt = st.text_input('Enter the Start point in secs')
    end_pt = st.text_input('Enter the end point in secs')

    if st.button("Submit URL", on_click=callback) and url:
        # Summarize and stash results so they survive later reruns.
        opt = t5_summarizer(url, start_pt, end_pt)
        st.session_state.opt = opt

    #st.write(st.session_state)
    if st.session_state.submitted and st.session_state.opt:
        text = st.selectbox('Select the summary you want to create a card of ', st.session_state.opt)

        option = st.selectbox('Which color template would you like to use ?', ('Elf Green', 'Dark Pastel Green'))
        if st.button("Generate Summary Card") and text and option:
            # NOTE(review): template image paths are hard-coded absolute
            # Windows paths — they should be bundled app assets.
            if option == 'Elf Green':
                # The Elf Green template fits at most 380 characters.
                if len(text) > 380:
                    st.error('Summary is too long !')
                else:
                    card_creator('C:/Users/akash/OneDrive/Pictures/iteration5_empty.png', text, 335)
            else:
                # The Dark Pastel Green template fits at most 430 characters.
                if len(text) > 430:
                    st.error('Summary is too long !')
                else:
                    card_creator('C:/Users/akash/OneDrive/Pictures/X-93.png', text, 285)

            # Only offer a download when a card was actually rendered
            # (card_creator saves it as 'card.png').
            if os.path.exists("card.png"):
                with open("card.png", "rb") as file:
                    btn = st.download_button(
                        label="Download card",
                        data=file,
                        file_name="card.png",
                        mime="image/png"
                    )


if __name__ == '__main__':
    main()