AkashKhamkar committed on
Commit
bd30af9
·
1 Parent(s): 2a4739a

Create app.py

Browse files

Creating the app.py file

Files changed (1) hide show
  1. app.py +187 -0
app.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sentence_transformers
2
+ from transformers import AutoTokenizer
3
+ from youtube_transcript_api import YouTubeTranscriptApi
4
+ import os
5
+ import ast
6
+ import pandas as pd
7
+ import nltk
8
+ nltk.download('stopwords')
9
+ from pyconverse import SemanticTextSegmention
10
+ from tqdm.notebook import tqdm
11
+ import time
12
+ import random
13
+ import re
14
+ import string
15
+ from symspellpy import SymSpell, Verbosity
16
+ import pkg_resources
17
+ import torch
18
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
19
+ from torch import cuda
20
+ from transformers import pipeline
21
+
22
+
23
# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Tokenizer loaded from the "t5-base" checkpoint; shared module-wide.
tokenizer = AutoTokenizer.from_pretrained("t5-base")
25
+
26
def id_ts_grabber(link):
    """Split a YouTube link into ``(video_id, time_stamp, end_pt)``.

    The link is split positionally on ``=``; the expected shape is
    ``...watch?v=<id>&t=<start>s&end=<end>``.  A 3-tuple is always returned:
    ``time_stamp`` and ``end_pt`` are ``None`` when the link does not carry
    them, so callers can unpack unconditionally.

    Raises:
        IndexError: if the link contains no ``=`` at all.
    """
    parts = link.split("=")
    # The chunk after the first "=" may drag the next parameter name along
    # ("<id>&t"); keep only the id itself.
    video_id = parts[1].split("&")[0]
    # Left raw (e.g. "120s&end"); seg_getter() strips the decoration.
    time_stamp = parts[2] if len(parts) > 2 else None
    end_pt = parts[3].split("&")[0] if len(parts) > 3 else None
    return video_id, time_stamp, end_pt


def seg_getter(data, ts, es):
    """Locate the slice of caption lines lying between two timestamps.

    Args:
        data: caption lines, each a Python-literal mapping with a ``'start'``
            key (the format written by ``get_cc``).
        ts: raw start stamp such as ``"120s&end"``; the characters of
            ``"s&end"`` are stripped from both ends before conversion.
        es: end time in seconds (anything ``float`` accepts), or ``None`` to
            run to the end of the transcript.

    Returns:
        ``(first_index, count)``: the index of the caption whose start is
        closest to ``ts`` and the number of captions up to, but not
        including, the caption closest to ``es``.  ``count`` is 0 when the
        end falls before the start.

    Raises:
        ValueError: if ``data`` is empty or a stamp is not numeric.
    """
    starts = [float(ast.literal_eval(line)['start']) for line in data]

    def nearest(target):
        # Index of the caption whose start time is closest to ``target``.
        return min(range(len(starts)), key=lambda i: abs(starts[i] - target))

    tid = nearest(float(ts.strip("s&end")))
    eid = len(starts) if es is None else nearest(float(es))
    return tid, max(eid - tid, 0)


def get_cc(video_id):
    """Download the English captions of a video into ``transcripts/``.

    A manually created transcript is preferred; an auto-generated one is the
    fallback.  Each caption is written as one line of text.

    Returns:
        Path of the written file, or ``None`` when no transcript could be
        found or anything went wrong while fetching/writing it (best effort).
    """
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        try:
            # filter for manually created transcripts
            transcript = transcript_list.find_manually_created_transcript(
                ['en', 'en-US', 'en-GB', 'en-IN']
            )
        except Exception:
            transcript = None

        manual = True
        if not transcript:
            try:
                # or automatically generated ones
                transcript = transcript_list.find_generated_transcript(['en'])
                manual = False
            except Exception:
                transcript = None

        if not transcript:
            return None

        suffix = "_cc_manual" if manual else "_cc_auto"
        # The output directory may not exist on a fresh checkout.
        os.makedirs('transcripts', exist_ok=True)
        file_name = os.path.join('transcripts', str(video_id) + suffix + ".txt")
        with open(file_name, 'w', encoding='utf-8') as file:
            for line in transcript.fetch():
                # str(line) is a repr, so NBSP / newline show up as the
                # literal escape sequences -- hence the raw-string patterns.
                file.write(str(line).replace(r'\xa0', ' ').replace(r'\n', '') + '\n')
        return file_name
    except Exception:
        # Deliberately best effort: the caller decides how to report failure.
        return None


def transcript_creator(filename, timestamp, end_pt):
    """Read a caption file and return the caption texts as a list.

    With ``timestamp`` set to ``None`` every caption is returned; otherwise
    only the window selected by ``seg_getter(data, timestamp, end_pt)``.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        data = f.readlines()

    if timestamp is None:
        selected = data
    else:
        start, count = seg_getter(data, timestamp, end_pt)
        selected = data[start:start + count]
    return [ast.literal_eval(line)['text'] for line in selected]


def transcript_collector(link):
    """Fetch the captions for ``link``; return ``(caption_texts, video_id)``.

    Raises:
        ValueError: if no transcript file could be produced for the video.
    """
    vid, ts, es = id_ts_grabber(link)
    print(" Fetching the transcript ")
    filename = get_cc(vid)
    if filename is None:
        raise ValueError(f"No English transcript could be fetched for video {vid!r}")
    return transcript_creator(filename, ts, es), vid


def segment(corpus):
    """Group caption texts into segments with ``SemanticTextSegmention``.

    Bracketed annotations (``[...]``) are removed and empty captions dropped
    before segmentation.
    """
    text_data = [re.sub(r'\[.*?\]', '', x).strip() for x in corpus]
    text_data = [x for x in text_data if x != '']
    df = pd.DataFrame(text_data, columns=["utterance"])
    # remove new line, tab, return
    df["utterance"] = df["utterance"].apply(
        lambda x: x.replace("\n", " ").replace("\r", " ").replace("\t", " ")
    )
    # remove NaN
    df.dropna(inplace=True)
    return SemanticTextSegmention(df).get_segments()


def word_seg(text, sym_spell):
    """Normalise whitespace in ``text`` and re-split its words with SymSpell."""
    text = text.replace("\n", " ").replace("\r", " ").replace("\t", " ").replace("\xa0", " ")
    return sym_spell.word_segmentation(text, max_edit_distance=0).segmented_string


def segment_loader(dataframe, max_tokens=512):
    """Greedily merge consecutive segments into chunks of bounded length.

    Args:
        dataframe: frame with ``'Segmented_Text'``, ``'video_id'`` and
            ``'Lengths'`` columns, in reading order.
        max_tokens: upper bound on the summed ``'Lengths'`` of a chunk.  A
            single segment longer than the bound is kept as its own chunk.

    Returns:
        A new DataFrame with columns ``'texts'`` and ``'video_id'``; segments
        are only merged while they share the chunk's ``video_id``.
    """
    chunks = []  # each entry: [text, video_id, running_length]
    rows = zip(dataframe['Segmented_Text'], dataframe['video_id'], dataframe['Lengths'])
    for text, video_id, length in rows:
        last = chunks[-1] if chunks else None
        if last is not None and last[1] == video_id and last[2] + length <= max_tokens:
            last[0] += " " + text
            last[2] += length
        else:
            chunks.append([text, video_id, length])
    return pd.DataFrame(
        [(text, video_id) for text, video_id, _ in chunks],
        columns=['texts', 'video_id'],
    )


def clean_text(link):
    """Turn a YouTube link into token-bounded chunks of transcript text.

    Steps: download the captions, segment them semantically, repair word
    boundaries with SymSpell, count tokens with the module-level
    ``tokenizer`` and merge neighbouring segments into chunks of at most 512
    tokens.

    Args:
        link: YouTube URL, optionally carrying start and end stamps
            (``...watch?v=<id>&t=<start>s&end=<end>``).

    Returns:
        DataFrame with columns ``'texts'`` and ``'video_id'``, one row per
        chunk, indexed from 0.

    Raises:
        ValueError: if no English transcript could be fetched.
    """
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt"
    )
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

    corpus, video_id = transcript_collector(link)
    segments = [word_seg(piece, sym_spell) for piece in segment(corpus)]

    sf = pd.DataFrame({
        'Segmented_Text': segments,
        'video_id': [video_id] * len(segments),
        'Lengths': [len(tokenizer(piece)['input_ids']) for piece in segments],
    })

    cleaned_text = segment_loader(sf)
    cleaned_text.reset_index(drop=True, inplace=True)

    return cleaned_text
187
+