File size: 9,727 Bytes
c476330
927c6ed
c476330
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a5eccfe
 
 
c476330
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46dabb5
c476330
879e354
 
 
c476330
 
 
 
 
927c6ed
c476330
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
927c6ed
c476330
 
927c6ed
c476330
 
 
 
46dabb5
c476330
 
 
 
4bca212
c476330
4bca212
c476330
 
 
 
 
 
 
879e354
c476330
 
 
 
 
 
879e354
c476330
 
4bca212
c476330
46dabb5
c476330
 
 
 
4bccb6d
c476330
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
#pip install gradio nltk youtube-transcript-api pytube gtts --quiet

from __future__ import division
import nltk
import string
import re
import io, os, time
import numpy as np
import gradio as gr
from tempfile import TemporaryFile
from gtts import gTTS
from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import defaultdict

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

"""## Transcript Summary Module"""

def summarize_text(url, percent):
  """Build an extractive summary of a YouTube video's transcript.

  Uses RAKE-style keyword extraction (degree/frequency word scores over
  stopword-delimited phrases) to score sentences, then keeps the top
  `percent` percent of sentences in original order.

  Args:
      url: YouTube video URL.
      percent: Target summary size as a percentage of the sentence count.

  Returns:
      (summary, video_html): the summary text and an HTML snippet that
      embeds the video player.

  Raises:
      gr.Error: if the URL is invalid or no transcript can be fetched.
  """

  # Check if the URL is valid
  try:
      youtube = YouTube(url)
  except Exception:
      raise gr.Error("Invalid YouTube URL")

  # Get transcript using youtube-transcript-api
  try:
      transcript = YouTubeTranscriptApi.get_transcript(youtube.video_id)
      Text = ' '.join([entry['text'] for entry in transcript])
  except Exception:
      raise gr.Error("Could not retrieve the video's transcript. Please try another video")

  # Clean text: keep alphanumerics plus '.', '_' and '-' ('.' marks sentence ends)

  Cleaned_text = re.sub(r'[^a-zA-Z0-9\._-]', ' ', Text)
  text = word_tokenize(Cleaned_text)
  case_insensitive_text = word_tokenize(Cleaned_text.lower())

  # Sentence segmentation: split the token stream on '.' tokens

  sentences = []
  tokenized_sentences = []
  sentence = " "
  for word in text:
      if word != '.':
          sentence+=str(word)+" "
      else:
          sentences.append(sentence.strip())
          tokenized_sentences.append(word_tokenize(sentence.lower().strip()))
          sentence = " "

  def lemmatize(POS_tagged_text):
      # Lemmatize adjectives with the adjective POS; everything else
      # falls back to WordNet's default (noun) lemmatization.
      wordnet_lemmatizer = WordNetLemmatizer()
      adjective_tags = ['JJ','JJR','JJS']
      lemmatized_text = []

      for word in POS_tagged_text:
          if word[1] in adjective_tags:
              lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0],pos="a")))
          else:
              lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0]))) #default POS = noun

      return lemmatized_text


  # Pre-processing: tag, lemmatize, then re-tag the lemmas for stopword generation

  POS_tagged_text = nltk.pos_tag(case_insensitive_text)
  lemmatized_text = lemmatize(POS_tagged_text)
  Processed_text = nltk.pos_tag(lemmatized_text)

  def generate_stopwords(POS_tagged_text):
    # Anything that is not a noun/adjective/foreign word becomes a stopword,
    # plus all punctuation and an external stopword list.
    stopwords = []

    wanted_POS = ['NN','NNS','NNP','NNPS','JJ','JJR','JJS','FW'] #may be add VBG too

    for word in POS_tagged_text:
        if word[1] not in wanted_POS:
            stopwords.append(word[0])

    punctuations = list(str(string.punctuation))
    stopwords = stopwords + punctuations

    # BUGFIX: open the stopword list with a context manager so the file
    # handle is always closed (it was previously leaked).
    # Source = https://www.ranks.nl/stopwords
    with open("long_stopwords.txt", "r") as stopword_file:
        for line in stopword_file.readlines():
            stopwords.append(str(line.strip()))

    return set(stopwords)

  stopwords = generate_stopwords(Processed_text)

  def partition_phrases(text,delimeters):
    # Split a token stream into candidate phrases at stopword boundaries.
    phrases = []
    phrase = " "
    for word in text:
        if word in delimeters:
            if phrase!= " ":
                phrases.append(str(phrase).split())
            phrase = " "
        elif word not in delimeters:
            phrase+=str(word)
            phrase+=" "
    return phrases

  phrase_list = partition_phrases(lemmatized_text,stopwords)

  phrase_partitioned_sentences = []

  for sentence in tokenized_sentences:
      POS_tagged_sentence = nltk.pos_tag(sentence)
      lemmatized_sentence = lemmatize(POS_tagged_sentence)
      phrase_partitioned_sentence = partition_phrases(lemmatized_sentence,stopwords)
      phrase_partitioned_sentences.append(phrase_partitioned_sentence)

   # Keyword scoring (RAKE): word score = degree(word) / frequency(word)

  frequency = defaultdict(int)
  degree = defaultdict(int)
  word_score = defaultdict(float)

  vocabulary = []

  for phrase in phrase_list:
      for word in phrase:
          frequency[word]+=1
          degree[word]+=len(phrase)
          if word not in vocabulary:
              vocabulary.append(word)

  for word in vocabulary:
      word_score[word] = degree[word]/frequency[word]

  # A phrase's score is the sum of its word scores (deduplicated phrases)
  phrase_scores = []
  keywords = []
  phrase_vocabulary = []

  for phrase in phrase_list:
      if phrase not in phrase_vocabulary:
          phrase_score = 0
          for word in phrase:
              phrase_score += word_score[word]
          phrase_scores.append(phrase_score)
          phrase_vocabulary.append(phrase)


  phrase_vocabulary = []

  # Rebuild the same deduplicated phrase order as joined keyword strings
  for phrase in phrase_list:
      if phrase not in phrase_vocabulary:
          keyword=''
          for word in phrase:
              keyword += str(word)+" "
          phrase_vocabulary.append(phrase)
          keyword = keyword.strip()
          keywords.append(keyword)

  sorted_index = np.flip(np.argsort(phrase_scores),0)

  tokenized_keywords = []
  sorted_keywords = []

  # Keep at most `threshold` top-scoring keywords
  keywords_num = 0
  threshold = 50
  if len(keywords)<threshold:
      keywords_num = len(keywords)
  else:
      keywords_num = threshold

  for i in range(0,keywords_num):
      sorted_keywords.append(keywords[sorted_index[i]])
      tokenized_keywords.append(sorted_keywords[i].split())

  # Score each sentence by the phrase scores of the keywords it contains
  sentence_scores = np.zeros((len(sentences)),np.float32)
  i=0
  for sentence in phrase_partitioned_sentences:
      for phrase in sentence:
          if phrase in tokenized_keywords:

              matched_tokenized_keyword_index = tokenized_keywords.index(phrase)

              corresponding_sorted_keyword = sorted_keywords[matched_tokenized_keyword_index]

              keyword_index_where_the_sorted_keyword_is_present = keywords.index(corresponding_sorted_keyword)

              sentence_scores[i]+=phrase_scores[keyword_index_where_the_sorted_keyword_is_present]
      i+=1

  Reduce_to_percent = percent
  summary_size = int(((Reduce_to_percent)/100)*len(sentences))

  # Always emit at least one sentence
  if summary_size == 0:
      summary_size = 1

  sorted_sentence_score_indices = np.flip(np.argsort(sentence_scores),0)

  indices_for_summary_results = sorted_sentence_score_indices[0:summary_size]

  summary = ""

  current_size = 0

  # Lead with the opening sentence if it did not make the cut (context)
  if 0 not in indices_for_summary_results and summary_size!=1:
      summary+=sentences[0]
      summary+=".\n\n"
      current_size+=1


  # Emit the selected sentences in their original document order
  for i in range(0,len(sentences)):
      if i in indices_for_summary_results:
          summary+=sentences[i]
          summary+=".\n\n"
          current_size += 1
      if current_size == summary_size:
          break

  # BUGFIX: reuse the already-validated YouTube object instead of making a
  # second network round-trip via YouTube(url).
  video_html = f'<div id="video-container" style="position: relative; width: 100%; padding-bottom: 56.25%;"><iframe id="video" style="position: absolute; width: 100%; height: 100%;" src="{youtube.embed_url}" frameborder="0" allowfullscreen></iframe></div>'

  if summary == "":
      raise gr.Error("Could not retrieve the video's transcript. Please try another video")

  return summary, video_html

"""## Text-to-Speech Module"""

AUDIO_DIR = 'audio_files'  # directory where generated speech files are stored
MAX_FILE_AGE = 60 * 60  # maximum age of audio files in seconds (1 hour)

def delete_old_audio_files():
    """Delete files in AUDIO_DIR whose mtime is older than MAX_FILE_AGE seconds."""
    # BUGFIX: guard against the directory not existing yet (os.listdir would
    # raise FileNotFoundError if this ran before any audio was generated).
    if not os.path.isdir(AUDIO_DIR):
        return
    now = time.time()
    for file_name in os.listdir(AUDIO_DIR):
        file_path = os.path.join(AUDIO_DIR, file_name)
        try:
            if now - os.path.getmtime(file_path) > MAX_FILE_AGE:
                os.remove(file_path)
        except OSError:
            # File vanished between listing and stat/remove (e.g. a
            # concurrent request cleaned it up) — safe to skip.
            continue

def text_to_speech(input_text):
    """Synthesize *input_text* to speech with gTTS and return the audio file path.

    Args:
        input_text: Text to read aloud (English).

    Returns:
        Path to an MP3 file inside AUDIO_DIR.
    """
    # create the text-to-speech audio
    tts = gTTS(input_text, lang='en', slow=False)

    # create the audio directory if it does not exist
    os.makedirs(AUDIO_DIR, exist_ok=True)

    # generate a unique file name for the audio file
    # BUGFIX: gTTS produces MP3 data, so the file must carry an .mp3
    # extension (it was previously mislabeled .wav).
    file_name = str(time.time()) + '.mp3'
    file_path = os.path.join(AUDIO_DIR, file_name)

    # write the MP3 stream straight to disk (no BytesIO round-trip needed)
    tts.save(file_path)

    # delete old audio files so the directory does not grow without bound
    delete_old_audio_files()

    # return the file path
    return file_path

# Gradio theme: Soft base with a yellow primary hue, a custom indigo-ish
# secondary palette, and zinc neutrals; background/label fills are remapped
# to the lighter palette stops.
theme = gr.themes.Soft(
    primary_hue="yellow",
    secondary_hue=gr.themes.Color(c100="#f8f8f8", c200="#d9d9d9", c300="#a5b4fc", c400="#818cf8", c50="#faf0e4", c500="#6366f1", c600="#4f46e5", c700="#4338ca", c800="#3730a3", c900="#312e81", c950="#2b2c5e"),
    neutral_hue="zinc",
).set(
    body_background_fill='*secondary_50',
    block_label_background_fill='*primary_50',
    block_label_background_fill_dark='*body_background_fill',
)


# Gradio UI: URL + length slider on the left, summary text + TTS on the right.
with gr.Blocks(theme=theme) as demo:

  gr.Markdown(
      '''
      <h1 align="center">Educational Video Transcript Summarizer</h1>

      <h6 align="center">Welcome to SnipSnap! Input a YouTube URL to get started.</h6>
      '''
  )

  with gr.Row():
    with gr.Column():
      # Left column: inputs and actions.
      # BUGFIX: removed dead assignment `fn = summarize_text` — the name was
      # never used; the callback is wired directly in summarize_btn.click below.
      url_input = gr.Textbox(label="URL", placeholder="Ex: https://youtu.be/JOiGEI9pQBs", info="Input YouTube URL")
      slider = gr.Slider(5, 100, value=20, step=5, label="Percent", info="Choose summary length (the lower the number, the shorter the summary)")

      with gr.Row():
        summarize_btn = gr.Button(variant="primary", value="Summarize")
        clear_btn = gr.ClearButton()

      video_preview = gr.HTML(label="Video Preview")
      examples = gr.Examples([['https://youtu.be/libKVRa01L8'], ['https://youtu.be/v6Agqm4K7Ok'], ['https://youtu.be/HpcTJW4ur54'], ['https://youtu.be/gjVX47dLlN8']], inputs=url_input)

    with gr.Column():
      # Right column: summary output and text-to-speech playback.
      summary_output = gr.Textbox(label="Summary", interactive=False, show_copy_button=True)
      tts_btn = gr.Button(variant="primary", value="Text-to-Speech")
      summary_tts = gr.Audio(label="Audio", interactive=False)

    # Buttons: wire callbacks; Clear resets all four components to None.
    summarize_btn.click(summarize_text, inputs=[url_input, slider], outputs=[summary_output, video_preview])
    tts_btn.click(text_to_speech, inputs=summary_output, outputs=summary_tts)
    clear_btn.click(lambda:[None, None, None, None], outputs=[url_input, summary_output, video_preview, summary_tts])

demo.queue()
demo.launch()