Commit: break it apart

- app.py +19 -42
- requirements.txt +0 -1

app.py
CHANGED
@@ -1,12 +1,14 @@
 import torch
-from transformers import
+from transformers import (
+    Wav2Vec2ForCTC,
+    Wav2Vec2Processor,
+    AutomaticSpeechRecognitionPipeline,
+)
 import gradio as gr
 import json
 from difflib import Differ
 import ffmpeg
 from pathlib import Path
-import aiohttp
-import asyncio
 import spaces
 
 # Set true if you're using huggingface inference API API https://huggingface.co/inference-api
@@ -16,12 +18,16 @@ MODEL = "facebook/wav2vec2-large-960h"
 # MODEL = "facebook/wav2vec2-base-960h"
 # MODEL = "patrickvonplaten/wav2vec2-large-960h-lv60-self-4-gram"
 
-
-
-
-
-
-
+# Load model and processor for manual processing (Spaces Zero compatible)
+model = Wav2Vec2ForCTC.from_pretrained(MODEL).to("cuda")
+processor = Wav2Vec2Processor.from_pretrained(MODEL)
+
+# Create pipeline with pre-loaded model and processor
+speech_recognizer = AutomaticSpeechRecognitionPipeline(
+    model=model,
+    feature_extractor=processor.feature_extractor,
+    tokenizer=processor.tokenizer,
+    device=0,  # Use first CUDA device
 )
 
 
@@ -36,18 +42,14 @@ for file in samples_data:
     SAMPLES.append(sample)
 VIDEOS = list(map(lambda x: [x["video"]], SAMPLES))
 
-total_inferences_since_reboot = 415
-total_cuts_since_reboot = 1539
-
 
 @spaces.GPU(duration=120)
-async def speech_to_text(video_file_path):
+def speech_to_text(video_file_path):
     """
     Takes a video path to convert to audio, transcribe audio channel to text and char timestamps
 
-    Using
+    Using AutomaticSpeechRecognitionPipeline with pre-loaded model for Spaces Zero compatibility
     """
-    global total_inferences_since_reboot
     if video_file_path == None:
         raise ValueError("Error no video input")
 
@@ -64,10 +66,8 @@ async def speech_to_text(video_file_path):
     except Exception as e:
         raise RuntimeError("Error converting video to audio")
 
-    ping("speech_to_text")
-
     try:
-        print(
+        print("Transcribing via local model")
         output = speech_recognizer(
             audio_memory,
             return_timestamps="char",
@@ -84,24 +84,16 @@ async def speech_to_text(video_file_path):
             ]
             for chunk in output["chunks"]
         ]
-        total_inferences_since_reboot += 1
-
-        print(
-            "\n\ntotal_inferences_since_reboot: ",
-            total_inferences_since_reboot,
-            "\n\n",
-        )
         return (transcription, transcription, timestamps)
     except Exception as e:
         raise RuntimeError("Error Running inference with local model", e)
 
 
-async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
+def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
    """
    Given original video input, text transcript + timestamps,
    and edit ext cuts video segments into a single video
    """
-    global total_cuts_since_reboot
 
     video_path = Path(video_in)
     video_file_name = video_path.stem
@@ -156,24 +148,9 @@ async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
 
     tokens = [(token[2:], token[0] if token[0] != " " else None) for token in filtered]
 
-    total_cuts_since_reboot += 1
-    ping("video_cuts")
-    print("\n\ntotal_cuts_since_reboot: ", total_cuts_since_reboot, "\n\n")
     return (tokens, output_video)
 
 
-def ping(name):
-    url = f"https://huggingface.co/api/telemetry/spaces/radames/edit-video-by-editing-text/{name}"
-    print("ping: ", url)
-
-    async def req():
-        async with aiohttp.ClientSession() as session:
-            async with session.get(url) as response:
-                print("pong: ", response.status)
-
-    asyncio.create_task(req())
-
-
 # ---- Gradio Layout -----
 video_in = gr.Video(label="Video file", elem_id="video-container")
 text_in = gr.Textbox(label="Transcription", lines=10, interactive=True)
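Read as a whole, the app.py diff replaces the previous model-loading block and the async telemetry helper with an AutomaticSpeechRecognitionPipeline assembled from a pre-loaded Wav2Vec2ForCTC model and Wav2Vec2Processor, which lets the @spaces.GPU entry points become plain synchronous functions. Below is a minimal sketch of that pattern in isolation; the `transcribe` wrapper and its raw-audio argument are illustrative stand-ins, not the Space's actual interface.

```python
# Minimal sketch of the pattern the commit adopts (ZeroGPU-compatible ASR).
# `transcribe` and its `audio` argument are illustrative, not the Space's API.
import spaces
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    AutomaticSpeechRecognitionPipeline,
)

MODEL = "facebook/wav2vec2-large-960h"

# Load once at import time, as the diff does; the pipeline then reuses the
# pre-loaded weights instead of resolving them inside a request handler.
model = Wav2Vec2ForCTC.from_pretrained(MODEL).to("cuda")
processor = Wav2Vec2Processor.from_pretrained(MODEL)

speech_recognizer = AutomaticSpeechRecognitionPipeline(
    model=model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    device=0,  # first CUDA device
)


@spaces.GPU(duration=120)  # plain sync function: no asyncio needed here
def transcribe(audio):
    # With return_timestamps="char", output["chunks"] is a list of dicts such
    # as {"text": "H", "timestamp": (0.01, 0.02)}, the raw material app.py
    # turns into its per-character timestamp list.
    output = speech_recognizer(audio, return_timestamps="char")
    chars = [
        (chunk["text"], chunk["timestamp"][0], chunk["timestamp"][1])
        for chunk in output["chunks"]
    ]
    return output["text"], chars
```

Making both handlers synchronous is also what strands aiohttp: the removed ping() helper was its only caller, which the requirements.txt change below reflects.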
requirements.txt
CHANGED
@@ -5,5 +5,4 @@ datasets
 librosa
 ffmpeg-python
 python-dotenv
-aiohttp
 spaces
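The dependency removal follows directly from the app.py diff: the deleted ping() telemetry helper was the only code using aiohttp (and asyncio), and asyncio.create_task would in any case assume a running event loop that the now-synchronous handlers no longer provide, so the package can be dropped from requirements.txt.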