Spaces:
Build error
Build error
app.py
CHANGED
|
@@ -1,23 +1,63 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
|
| 3 |
-
#final
|
| 4 |
-
import gradio as gr
|
| 5 |
-
#import json
|
| 6 |
-
#from difflib import Differ
|
| 7 |
import ffmpeg
|
| 8 |
-
#import os
|
| 9 |
from pathlib import Path
|
| 10 |
-
#import time
|
| 11 |
|
| 12 |
API_URL = "https://api-inference.huggingface.co/models/facebook/wav2vec2-base-960h"
|
| 13 |
#headers = {"Authorization": "Bearer <redacted>"}
|
| 14 |
HF_TOKEN = os.environ["HF_TOKEN"]
|
| 15 |
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
|
| 16 |
|
| 17 |
-
#convert video to audio
|
| 18 |
-
video_path = Path("./ShiaLaBeouf.mp4")
|
| 19 |
-
audio_memory, _ = ffmpeg.input(video_path).output('-', format="wav", ac=1, ar='16k').overwrite_output().global_args('-loglevel', 'quiet').run(capture_stdout=True)
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
#calling the hosted model
|
| 22 |
def query_api(audio_bytes: bytes):
|
| 23 |
"""
|
|
@@ -38,20 +78,8 @@ def query_api(audio_bytes: bytes):
|
|
| 38 |
json_reponse = json.loads(response.content.decode("utf-8"))
|
| 39 |
return json_reponse
|
| 40 |
|
| 41 |
-
#Getting transcripts using wav2Vec2 huggingface hosted accelerated inference
|
| 42 |
-
#sending audio file in request along with stride and chunk length information
|
| 43 |
-
model_response = query_api(audio_memory)
|
| 44 |
-
|
| 45 |
-
#model response has both - transcripts as well as character timestamps or chunks
|
| 46 |
-
transcription = model_response["text"].lower()
|
| 47 |
-
chnk = model_response["chunks"]
|
| 48 |
-
|
| 49 |
-
#creating lists from chunks to consume downstream easily
|
| 50 |
-
timestamps = [[chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
|
| 51 |
-
for chunk in chnk]
|
| 52 |
|
| 53 |
-
|
| 54 |
-
#getting word timestamps from character timestamps
|
| 55 |
def get_word_timestamps(timestamps):
|
| 56 |
words, word = [], []
|
| 57 |
letter_timestamp, word_timestamp, words_timestamp = [], [], []
|
|
@@ -68,15 +96,6 @@ def get_word_timestamps(timestamps):
|
|
| 68 |
words = [word.strip() for word in words]
|
| 69 |
return words, words_timestamp
|
| 70 |
|
| 71 |
-
words, words_timestamp = get_word_timestamps(timestamps)
|
| 72 |
-
#words = [word.strip() for word in words]
|
| 73 |
-
|
| 74 |
-
print(f"Total words in the audio transcript is:{len(words)}, transcript word list is :{words}")
|
| 75 |
-
print(f"Total Word timestamps derived fromcharacter timestamp are :{len(words_timestamp)}, Word timestamps are :{words_timestamp}")
|
| 76 |
-
|
| 77 |
-
#creating list from input gif transcript
|
| 78 |
-
gif = "don't let your dreams be dreams"
|
| 79 |
-
giflist = gif.split()
|
| 80 |
|
| 81 |
#getting index of gif words in main transcript
|
| 82 |
def get_gif_word_indexes(total_words_list, gif_words_list):
|
|
@@ -90,11 +109,9 @@ def get_gif_word_indexes(total_words_list, gif_words_list):
|
|
| 90 |
if total_words_list[idx:idx+lengthgif_words_list] == gif_words_list:
|
| 91 |
yield tuple(range(idx, idx+lengthgif_words_list))
|
| 92 |
|
| 93 |
-
#getting gif indexes from the generator
|
| 94 |
-
giflist_indxs = list(list(get_gif_word_indexes(words, giflist))[0])
|
| 95 |
|
| 96 |
#getting start and end timestamps for gif transcript
|
| 97 |
-
def get_gif_timestamps(giflist_indxs):
|
| 98 |
#giflist_indxs = list(list(get_gif_word_indexes(words, giflist))[0])
|
| 99 |
min_idx = min(giflist_indxs)
|
| 100 |
max_idx = max(giflist_indxs)
|
|
@@ -103,15 +120,52 @@ def get_gif_timestamps(giflist_indxs):
|
|
| 103 |
start_seconds, end_seconds = gif_words_timestamp[0][0], gif_words_timestamp[-1][-1]
|
| 104 |
return start_seconds, end_seconds
|
| 105 |
|
| 106 |
-
#getting start and end timestamps for a gif video
|
| 107 |
-
start_seconds, end_seconds = get_gif_timestamps(giflist_indxs)
|
| 108 |
|
| 109 |
#extracting the video and building and serving a .gif image
|
| 110 |
def generate_gif(start_seconds, end_seconds):
|
| 111 |
final_clip = video.subclip(start_seconds, end_seconds)
|
| 112 |
#final_clip.write_videofile("/content/gdrive/My Drive/AI/videoedit/gif1.mp4")
|
| 113 |
-
final_clip.write_gif("
|
| 114 |
final_clip.close()
|
| 115 |
return
|
| 116 |
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import json
import os
from pathlib import Path

import ffmpeg
import gradio as gr
|
|
|
|
| 4 |
|
| 5 |
# Hosted inference endpoint of the wav2vec2 speech-recognition model.
API_URL = "https://api-inference.huggingface.co/models/facebook/wav2vec2-base-960h"

# The access token is taken from the HF_TOKEN environment variable; a missing
# variable raises KeyError at import time. The token must never be written
# into this file (the previously commented-out literal token was removed).
HF_TOKEN = os.environ["HF_TOKEN"]
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
|
| 9 |
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
+
def generate_transcripts(in_video):
    """Transcribe a video's audio track and derive word-level timestamps.

    The audio is extracted in memory with ffmpeg as mono 16 kHz WAV, sent to
    the hosted model through ``query_api`` and the returned character chunks
    are grouped into words by ``get_word_timestamps``.

    Args:
        in_video: Video to transcribe. Assumed to be a filesystem path as
            handed over by the Gradio video input (TODO confirm for the
            Gradio version in use). When empty or ``None`` the bundled
            sample ``./ShiaLaBeouf.mp4`` is used.

    Returns:
        A ``(transcription, words, words_timestamp)`` tuple: the lower-cased
        transcript text followed by the two values returned by
        ``get_word_timestamps``.
    """
    print("********* Inside generate_transcripts() **********")
    print(f" input video is : {in_video}")

    # Use the supplied video; earlier the argument was ignored and the sample
    # clip was transcribed no matter what had been uploaded.
    video_path = Path(in_video) if in_video else Path("./ShiaLaBeouf.mp4")

    # Convert video to audio: '-' streams the WAV data to stdout so nothing
    # is written to disk.
    audio_memory, _ = (
        ffmpeg.input(str(video_path))
        .output('-', format="wav", ac=1, ar='16k')
        .overwrite_output()
        .global_args('-loglevel', 'quiet')
        .run(capture_stdout=True)
    )

    # Getting transcripts using the wav2vec2 hosted inference endpoint.
    model_response = query_api(audio_memory)

    # The model response holds the transcript as well as character-level
    # chunks with their timestamps.
    transcription = model_response["text"].lower()
    chnk = model_response["chunks"]

    # Flatten each chunk to [text, start, end] for downstream consumption.
    timestamps = [
        [chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
        for chunk in chnk
    ]

    # Getting words and word timestamps.
    words, words_timestamp = get_word_timestamps(timestamps)
    print(f"Total words in the audio transcript is:{len(words)}, transcript word list is :{words}")
    print(f"Total Word timestamps derived fromcharacter timestamp are :{len(words_timestamp)}, Word timestamps are :{words_timestamp}")

    return transcription, words, words_timestamp
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _parse_textbox_list(value):
    """Return *value* as a list, decoding the ``str()`` form of a list if needed."""
    import ast  # local import: only this helper needs it

    if not isinstance(value, str):
        return value
    text = value.strip()
    # An empty textbox means nothing has been produced yet.
    return ast.literal_eval(text) if text else []


def generate_gifs(gif_transcript, words, words_timestamp):
    """Create ``./gifimage.gif`` for a phrase of the transcript.

    The phrase is located in ``words`` with ``get_gif_word_indexes``, its
    start/end times are looked up with ``get_gif_timestamps`` and the clip is
    written by ``generate_gif``.

    Args:
        gif_transcript: Phrase to turn into a GIF. When empty or ``None`` the
            default phrase "don't let your dreams be dreams" is used.
        words: Transcript words, as a list or as the ``str()`` form of one.
        words_timestamp: Per-word timestamps, as a list or as the ``str()``
            form of one.

    Returns:
        An HTML ``<img>`` snippet that points at ``./gifimage.gif``.

    Raises:
        IndexError: If the phrase does not occur in ``words``.
    """
    print("********* Inside generate_gifs() **********")

    # NOTE(review): both values are routed through Textbox components in the
    # UI, so they presumably arrive as strings rather than lists - verify
    # against the Gradio version in use. Real lists pass through untouched.
    words = _parse_textbox_list(words)
    words_timestamp = _parse_textbox_list(words_timestamp)

    # Creating list from input gif transcript; lower-cased because the
    # transcript words are lower-cased when they are produced.
    gif = (gif_transcript or "").strip().lower()
    if not gif:
        gif = "don't let your dreams be dreams"
    giflist = gif.split()

    # Only the first occurrence of the phrase is used.
    first_match = next(get_gif_word_indexes(words, giflist), None)
    if first_match is None:
        raise IndexError(f"gif transcript {gif!r} was not found in the video transcript")
    giflist_indxs = list(first_match)

    # Getting start and end timestamps for a gif video.
    start_seconds, end_seconds = get_gif_timestamps(giflist_indxs, words_timestamp)

    # Generate the .gif image.
    generate_gif(start_seconds, end_seconds)
    html_out = "<img src='./gifimage.gif' />"

    return html_out
|
| 59 |
+
|
| 60 |
+
|
| 61 |
#calling the hosted model
|
| 62 |
def query_api(audio_bytes: bytes):
|
| 63 |
"""
|
|
|
|
| 78 |
json_reponse = json.loads(response.content.decode("utf-8"))
|
| 79 |
return json_reponse
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
+
#getting word timestamps from character timestamps
|
|
|
|
| 83 |
def get_word_timestamps(timestamps):
|
| 84 |
words, word = [], []
|
| 85 |
letter_timestamp, word_timestamp, words_timestamp = [], [], []
|
|
|
|
| 96 |
words = [word.strip() for word in words]
|
| 97 |
return words, words_timestamp
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
#getting index of gif words in main transcript
|
| 101 |
def get_gif_word_indexes(total_words_list, gif_words_list):
|
|
|
|
| 109 |
if total_words_list[idx:idx+lengthgif_words_list] == gif_words_list:
|
| 110 |
yield tuple(range(idx, idx+lengthgif_words_list))
|
| 111 |
|
|
|
|
|
|
|
| 112 |
|
| 113 |
#getting start and end timestamps for gif transcript
|
| 114 |
+
def get_gif_timestamps(giflist_indxs, words_timestamp):
|
| 115 |
#giflist_indxs = list(list(get_gif_word_indexes(words, giflist))[0])
|
| 116 |
min_idx = min(giflist_indxs)
|
| 117 |
max_idx = max(giflist_indxs)
|
|
|
|
| 120 |
start_seconds, end_seconds = gif_words_timestamp[0][0], gif_words_timestamp[-1][-1]
|
| 121 |
return start_seconds, end_seconds
|
| 122 |
|
|
|
|
|
|
|
| 123 |
|
| 124 |
#extracting the video and building and serving a .gif image
|
| 125 |
def generate_gif(start_seconds, end_seconds):
    """Cut a sub-clip out of ``video`` and save it as ``./gifimage.gif``.

    NOTE(review): ``video`` is a module-level name that is not defined in the
    visible part of this file - confirm it is created before this is called.

    Args:
        start_seconds: Start of the sub-clip, passed to ``video.subclip``.
        end_seconds: End of the sub-clip, passed to ``video.subclip``.

    Returns:
        None. The result is the file written to ``./gifimage.gif``.
    """
    final_clip = video.subclip(start_seconds, end_seconds)
    try:
        final_clip.write_gif("./gifimage.gif")
    finally:
        # Close the clip even when writing the GIF fails.
        final_clip.close()
|
| 131 |
|
| 132 |
+
|
| 133 |
+
# Sample clip offered as a clickable example.
sample_video = ['./ShiaLaBeouf.mp4']
sample_vid = gr.Video(label='Video file')  # for displaying the example
examples = gr.components.Dataset(components=[sample_vid], samples=[sample_video], type='values')


demo = gr.Blocks()

# NOTE(review): the nesting below was reconstructed from text whose
# indentation had been lost - confirm which components belong to each Row.
with demo:
    with gr.Row():
        # Incoming video.
        input_video = gr.Video(label="Upload a Video", visible=True)
        # Displays the transcription generated for the input video.
        text_transcript = gr.Textbox(label="Transcripts", lines=10, interactive=True)
        # Hidden fields that carry the word list and word timestamps from
        # generate_transcripts to generate_gifs.
        text_words = gr.Textbox(visible=False)
        text_wordstimestamps = gr.Textbox(visible=False)
        # Phrase the user wants turned into a GIF.
        text_gif_transcript = gr.Textbox(label="Transcripts", placeholder="Copy paste transcripts here to create GIF image", lines=3, interactive=True)
        out_gif = gr.HTML(label="Generated GIF from transcript selected", show_label=True)

    examples.render()

    def load_examples(video):
        """Pass the clicked example through to ``input_video``."""
        print("****** inside load_example() ******")
        print("in_video is : ", video)
        return video

    examples.click(load_examples, examples, input_video)

    with gr.Row():
        button_transcript = gr.Button("Generate transcripts")
        button_gifs = gr.Button("Create Gif")

    button_transcript.click(generate_transcripts, input_video, [text_transcript, text_words, text_wordstimestamps])
    button_gifs.click(generate_gifs, [text_gif_transcript, text_words, text_wordstimestamps], out_gif)


# Start the server when the file is run as a script; nothing else in the
# visible part of this file launches ``demo``.
if __name__ == "__main__":
    demo.launch()
|
| 170 |
+
|
| 171 |
+
|