Spaces:

AkashKhamkar
/

test1

Runtime error

App Files Files Community

AkashKhamkar committed on Sep 8, 2022

Commit

bd30af9

1 Parent(s): 2a4739a

Create app.py

Browse files

Creating the app.py file

Files changed (1) hide show

app.py +187 -0

app.py ADDED Viewed

	@@ -0,0 +1,187 @@

+import sentence_transformers
+from transformers import AutoTokenizer
+from youtube_transcript_api import YouTubeTranscriptApi
+import os
+import ast
+import pandas as pd
+import nltk
+nltk.download('stopwords')
+from pyconverse import SemanticTextSegmention
+from tqdm.notebook import tqdm
+import time
+import random
+import re
+import string
+from symspellpy import SymSpell, Verbosity
+import pkg_resources
+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from torch import cuda
+from transformers import pipeline
+# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
+if cuda.is_available():
+  device = 'cuda'
+else:
+  device = 'cpu'
+# T5 tokenizer; clean_text() uses it to count the tokens of each segment.
+tokenizer = AutoTokenizer.from_pretrained("t5-base")
+def clean_text(link):
+  """Turn a YouTube link into transcript chunks sized for the T5 tokenizer.
+
+  The captions for ``link`` are downloaded, grouped into semantic segments,
+  word-segmented with SymSpell, and consecutive segments are then merged
+  while their combined T5 token count stays within 512.
+
+  Args:
+    link: YouTube URL. It is split on "="; the video id is read from the
+      piece after the first "=" and, when present, the following two pieces
+      give the start and end of the section to keep.
+
+  Returns:
+    A pandas DataFrame of merged chunks (``texts``) with their ``video_id``,
+    re-indexed from 0.
+  """
+  # SymSpell instance loaded with the English frequency dictionary that ships
+  # inside the symspellpy package (term in column 0, count in column 1).
+  sym_spell = SymSpell(prefix_length=7, max_dictionary_edit_distance=2)
+  sym_spell.load_dictionary(
+      pkg_resources.resource_filename(
+          "symspellpy", "frequency_dictionary_en_82_765.txt"
+      ),
+      term_index=0,
+      count_index=1,
+  )
+  def id_ts_grabber(link):
+    """Split a YouTube link into its video id and optional start/end markers.
+
+    Judging by the parsing, links presumably look like ``...watch?v=<id>`` or
+    ``...watch?v=<id>&t=<start>s&end=<end>`` (TODO confirm with the UI that
+    builds them).
+
+    Args:
+      link: URL string containing at least one "=".
+
+    Returns:
+      A 3-tuple ``(video_id, time_stamp, end_pt)``. ``time_stamp`` and
+      ``end_pt`` are the raw pieces between the "=" signs (for example
+      ``"120s&end"`` and ``"300"``); both are None unless the link carries
+      a start and an end.
+
+    Raises:
+      ValueError: if the link contains no "=" and therefore no video id.
+    """
+    pieces = link.split("=")
+    if len(pieces) < 2:
+      raise ValueError(f"No video id found in link: {link!r}")
+    # The piece after the first "=" also carries the name of the next query
+    # parameter (e.g. "abc123&t"); keep only the id itself.
+    video_id = pieces[1].split("&")[0]
+    if len(pieces) > 3:
+      return video_id, pieces[2], pieces[3]
+    # Always hand back three values so callers can unpack uniformly, and
+    # ignore a start marker that has no matching end.
+    return video_id, None, None
+  def seg_getter(data,ts,es):
+    """Locate the caption lines covering a start/end time window.
+
+    Args:
+      data: transcript lines, each the literal form of a dict with a
+        'start' entry.
+      ts: start marker as taken from the link, e.g. "120s&end"; the
+        characters in "s&end" are stripped from both ends before parsing.
+      es: end of the window, as a number or a string; a string is cleaned
+        the same way as ``ts`` so "300" and "300s" both work.
+
+    Returns:
+      ``(tid, count)``: the index of the caption whose start is closest to
+      ``ts`` and the number of captions from there up to, but excluding,
+      the caption closest to ``es`` (0 when the end is not after the start).
+
+    Raises:
+      ValueError: if ``data`` is empty or a marker is not numeric.
+    """
+    def _seconds(marker):
+      # Markers from the link may carry a unit/parameter suffix ("120s&end").
+      if isinstance(marker, str):
+        marker = marker.strip("s&end")
+      return float(marker)
+    def _nearest(target):
+      # Index of the first caption whose start time is closest to target.
+      return min(range(len(starts)), key=lambda i: abs(starts[i] - target))
+    starts = [float(ast.literal_eval(line)['start']) for line in data]
+    tid = _nearest(_seconds(ts))
+    eid = _nearest(_seconds(es))
+    # Same as len(starts[tid:eid]): never negative.
+    return tid, max(eid - tid, 0)
+  def get_cc(video_id):
+    """Download the English captions of a video into ``transcripts/``.
+
+    A manually created transcript is preferred; an automatically generated
+    one is used when no manual transcript can be found.
+
+    Args:
+      video_id: YouTube video id.
+
+    Returns:
+      Path of the written file (``<video_id>_cc_manual.txt`` or
+      ``<video_id>_cc_auto.txt``, one caption entry per line), or None when
+      no transcript is available or the download fails.
+    """
+    try:
+      transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
+      manual = True
+      try:
+        # Manually created captions first.
+        transcript = transcript_list.find_manually_created_transcript(['en','en-US','en-GB','en-IN'])
+      except Exception:
+        transcript = None
+      if not transcript:
+        try:
+          # Otherwise fall back to the automatically generated ones.
+          transcript = transcript_list.find_generated_transcript(['en'])
+          manual = False
+        except Exception:
+          transcript = None
+      if not transcript:
+        return None
+      suffix = "_cc_manual" if manual else "_cc_auto"
+      # The output folder is not guaranteed to exist (e.g. on a fresh
+      # deployment); without it open() fails and the captions are lost.
+      os.makedirs('transcripts', exist_ok=True)
+      file_name = os.path.join('transcripts', str(video_id) + suffix + ".txt")
+      with open(file_name, 'w') as file:
+        for line in transcript.fetch():
+          # The raw-string patterns match literal backslash sequences in the
+          # entry's string form (presumably a dict repr), not real newlines.
+          file.write(str(line).replace(r'\xa0', ' ').replace(r'\n', '') + '\n')
+      return file_name
+    except Exception as e:
+      # Best effort: report the failure instead of hiding it, then signal
+      # "no transcript" to the caller.
+      print(f"Could not fetch captions for {video_id}: {e}")
+      return None
+  def transcript_creator(filename,timestamp,end_pt):
+    """Read caption texts from a transcript file, optionally for a time window.
+
+    Args:
+      filename: path to a file holding one caption dict literal per line.
+      timestamp: start marker from the link, or None for the whole transcript.
+      end_pt: end marker from the link; only used when ``timestamp`` is set.
+
+    Returns:
+      List of the 'text' values of the selected captions, in file order.
+    """
+    with open(filename, 'r') as f:
+      data = f.readlines()
+    if timestamp is None:
+      selected = data
+    else:
+      # seg_getter gives the first caption of the window and how many follow.
+      start, count = seg_getter(data, timestamp, end_pt)
+      selected = data[start:start + count]
+    return [ast.literal_eval(line)['text'] for line in selected]
+  def transcript_collector(link):
+    """Fetch the caption texts for a link.
+
+    Args:
+      link: YouTube URL, parsed by id_ts_grabber().
+
+    Returns:
+      ``(texts, video_id)`` where ``texts`` is the list produced by
+      transcript_creator().
+
+    Raises:
+      ValueError: if no transcript could be downloaded for the video.
+    """
+    # Accept a 2-tuple (video_id, time_stamp) from id_ts_grabber as well as a
+    # 3-tuple; a missing end point becomes None instead of an unpack error.
+    vid, ts, *rest = id_ts_grabber(link)
+    es = rest[0] if rest else None
+    print(" Fetching the transcript ")
+    filename = get_cc(vid)
+    if filename is None:
+      # get_cc signals failure with None; opening None would raise a
+      # confusing TypeError further down.
+      raise ValueError(f"No transcript could be fetched for video {vid!r}")
+    return transcript_creator(filename, ts, es), vid
+  # One-row frame holding the list of caption texts and the video id.
+  # It is built in a single step: assigning a list to one cell through .loc
+  # on an empty frame relies on setting-with-enlargement, which, depending on
+  # the pandas version, can fail with "Must have equal len keys and value
+  # when setting with an iterable".
+  _caption_lines, _video_id = transcript_collector(link)
+  transcript = pd.DataFrame({'text': [_caption_lines], 'video_id': [_video_id]})
+  def segment(corpus):
+    """Group caption lines into semantic segments.
+
+    Bracketed annotations (text between "[" and "]") are removed from every
+    line, lines that end up empty are dropped, and the remaining utterances
+    are passed to pyconverse's SemanticTextSegmention.
+
+    Args:
+      corpus: iterable of caption strings.
+
+    Returns:
+      Whatever ``SemanticTextSegmention.get_segments()`` returns for the
+      cleaned utterances.
+    """
+    def _flatten_whitespace(utterance):
+      # Newlines, carriage returns and tabs become plain spaces.
+      return utterance.replace("\n", " ").replace("\r", " ").replace("\t", " ")
+    utterances = []
+    for raw in corpus:
+      stripped = re.sub(r'\[.*?\]', '', raw).strip()
+      if stripped != '':
+        utterances.append(stripped)
+    df = pd.DataFrame(utterances, columns=["utterance"])
+    df["utterance"] = df["utterance"].apply(_flatten_whitespace)
+    # Drop any rows with missing values before segmentation.
+    df.dropna(inplace=True)
+    return SemanticTextSegmention(df).get_segments()
+  # Table with one row per semantic segment, each tagged with the video id.
+  sf = pd.DataFrame(columns=['Segmented_Text','video_id'])
+  text = segment(transcript.at[0,'text'])
+  for row, piece in enumerate(text):
+    sf.loc[row, 'Segmented_Text'] = piece
+    sf.loc[row, 'video_id'] = transcript.at[0,'video_id']
+  def word_seg(text):
+    """Run SymSpell word segmentation over ``text``.
+
+    Newlines, carriage returns, tabs and non-breaking spaces are first
+    replaced by plain spaces.
+
+    Returns:
+      The ``segmented_string`` of SymSpell's word segmentation, run with
+      ``max_edit_distance=0``.
+    """
+    for whitespace in ("\n", "\r", "\t", "\xa0"):
+      text = text.replace(whitespace, " ")
+    return sym_spell.word_segmentation(text, max_edit_distance=0).segmented_string
+  # Word-segment every segment and record its T5 token count.
+  for idx in range(len(sf)):
+    segmented = word_seg(sf.at[idx, 'Segmented_Text'])
+    sf.loc[idx, 'Segmented_Text'] = segmented
+    sf.loc[idx, 'Lengths'] = len(tokenizer(segmented)['input_ids'])
+  # Empty frame with a single 'texts' column.
+  texts = pd.DataFrame(columns=['texts'])
+  def segment_loader(dataframe, max_tokens=512):
+    """Merge consecutive segments into chunks that fit a token budget.
+
+    Starting from each not-yet-merged row, the following rows of the same
+    video are appended (joined by a single space) while the summed
+    ``Lengths`` stays at or below ``max_tokens``. A segment that alone
+    exceeds the budget is kept as its own chunk.
+
+    Args:
+      dataframe: frame with 'Segmented_Text', 'Lengths' and 'video_id'
+        columns, one row per segment, in transcript order.
+      max_tokens: upper bound on the summed length of a merged chunk.
+
+    Returns:
+      DataFrame with 'texts' and 'video_id' columns, indexed by the position
+      of the first segment of each chunk (so the index may have gaps). For
+      an empty input, an empty frame with only a 'texts' column.
+    """
+    if len(dataframe) == 0:
+      # 'Lengths' may not even exist when there were no segments to measure.
+      return pd.DataFrame(columns=['texts'])
+    segments = dataframe['Segmented_Text'].tolist()
+    lengths = dataframe['Lengths'].tolist()
+    video_ids = dataframe['video_id'].tolist()
+    total = len(segments)
+    chunk_starts, chunk_texts, chunk_ids = [], [], []
+    pos = 0
+    while pos < total:
+      parts = [segments[pos]]
+      used = lengths[pos]
+      nxt = pos + 1
+      # Greedily absorb the following segments of the same video.
+      while (nxt < total
+             and video_ids[nxt] == video_ids[pos]
+             and used + lengths[nxt] <= max_tokens):
+        parts.append(segments[nxt])
+        used += lengths[nxt]
+        nxt += 1
+      chunk_starts.append(pos)
+      chunk_texts.append(" ".join(parts))
+      chunk_ids.append(video_ids[pos])
+      # Continue after the last absorbed segment.
+      pos = nxt
+    return pd.DataFrame({'texts': chunk_texts, 'video_id': chunk_ids}, index=chunk_starts)
+  # Merge the segments into chunks and renumber the rows from 0.
+  cleaned_text = segment_loader(sf).reset_index(drop=True)
+  return cleaned_text