radames committed on
Commit
9982a93
·
1 Parent(s): 146fed7

break it apart

Browse files
Files changed (2) hide show
  1. app.py +19 -42
  2. requirements.txt +0 -1
app.py CHANGED
@@ -1,12 +1,14 @@
1
  import torch
2
- from transformers import pipeline
 
 
 
 
3
  import gradio as gr
4
  import json
5
  from difflib import Differ
6
  import ffmpeg
7
  from pathlib import Path
8
- import aiohttp
9
- import asyncio
10
  import spaces
11
 
12
  # Set true if you're using huggingface inference API https://huggingface.co/inference-api
@@ -16,12 +18,16 @@ MODEL = "facebook/wav2vec2-large-960h"
16
  # MODEL = "facebook/wav2vec2-base-960h"
17
  # MODEL = "patrickvonplaten/wav2vec2-large-960h-lv60-self-4-gram"
18
 
19
- speech_recognizer = pipeline(
20
- task="automatic-speech-recognition",
21
- model=f"{MODEL}",
22
- tokenizer=f"{MODEL}",
23
- framework="pt",
24
- device="cuda",
 
 
 
 
25
  )
26
 
27
 
@@ -36,18 +42,14 @@ for file in samples_data:
36
  SAMPLES.append(sample)
37
  VIDEOS = list(map(lambda x: [x["video"]], SAMPLES))
38
 
39
- total_inferences_since_reboot = 415
40
- total_cuts_since_reboot = 1539
41
-
42
 
43
  @spaces.GPU(duration=120)
44
- async def speech_to_text(video_file_path):
45
  """
46
  Takes a video path to convert to audio, transcribe audio channel to text and char timestamps
47
 
48
- Using https://huggingface.co/tasks/automatic-speech-recognition pipeline
49
  """
50
- global total_inferences_since_reboot
51
  if video_file_path == None:
52
  raise ValueError("Error no video input")
53
 
@@ -64,10 +66,8 @@ async def speech_to_text(video_file_path):
64
  except Exception as e:
65
  raise RuntimeError("Error converting video to audio")
66
 
67
- ping("speech_to_text")
68
-
69
  try:
70
- print(f"Transcribing via local model")
71
  output = speech_recognizer(
72
  audio_memory,
73
  return_timestamps="char",
@@ -84,24 +84,16 @@ async def speech_to_text(video_file_path):
84
  ]
85
  for chunk in output["chunks"]
86
  ]
87
- total_inferences_since_reboot += 1
88
-
89
- print(
90
- "\n\ntotal_inferences_since_reboot: ",
91
- total_inferences_since_reboot,
92
- "\n\n",
93
- )
94
  return (transcription, transcription, timestamps)
95
  except Exception as e:
96
  raise RuntimeError("Error Running inference with local model", e)
97
 
98
 
99
- async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
100
  """
101
  Given original video input, text transcript + timestamps,
102
  and edited text, cuts video segments into a single video
103
  """
104
- global total_cuts_since_reboot
105
 
106
  video_path = Path(video_in)
107
  video_file_name = video_path.stem
@@ -156,24 +148,9 @@ async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
156
 
157
  tokens = [(token[2:], token[0] if token[0] != " " else None) for token in filtered]
158
 
159
- total_cuts_since_reboot += 1
160
- ping("video_cuts")
161
- print("\n\ntotal_cuts_since_reboot: ", total_cuts_since_reboot, "\n\n")
162
  return (tokens, output_video)
163
 
164
 
165
- def ping(name):
166
- url = f"https://huggingface.co/api/telemetry/spaces/radames/edit-video-by-editing-text/{name}"
167
- print("ping: ", url)
168
-
169
- async def req():
170
- async with aiohttp.ClientSession() as session:
171
- async with session.get(url) as response:
172
- print("pong: ", response.status)
173
-
174
- asyncio.create_task(req())
175
-
176
-
177
  # ---- Gradio Layout -----
178
  video_in = gr.Video(label="Video file", elem_id="video-container")
179
  text_in = gr.Textbox(label="Transcription", lines=10, interactive=True)
 
1
  import torch
2
+ from transformers import (
3
+ Wav2Vec2ForCTC,
4
+ Wav2Vec2Processor,
5
+ AutomaticSpeechRecognitionPipeline,
6
+ )
7
  import gradio as gr
8
  import json
9
  from difflib import Differ
10
  import ffmpeg
11
  from pathlib import Path
 
 
12
  import spaces
13
 
14
  # Set true if you're using huggingface inference API https://huggingface.co/inference-api
 
18
  # MODEL = "facebook/wav2vec2-base-960h"
19
  # MODEL = "patrickvonplaten/wav2vec2-large-960h-lv60-self-4-gram"
20
 
21
+ # Load model and processor for manual processing (Spaces Zero compatible)
22
+ model = Wav2Vec2ForCTC.from_pretrained(MODEL).to("cuda")
23
+ processor = Wav2Vec2Processor.from_pretrained(MODEL)
24
+
25
+ # Create pipeline with pre-loaded model and processor
26
+ speech_recognizer = AutomaticSpeechRecognitionPipeline(
27
+ model=model,
28
+ feature_extractor=processor.feature_extractor,
29
+ tokenizer=processor.tokenizer,
30
+ device=0, # Use first CUDA device
31
  )
32
 
33
 
 
42
  SAMPLES.append(sample)
43
  VIDEOS = list(map(lambda x: [x["video"]], SAMPLES))
44
 
 
 
 
45
 
46
  @spaces.GPU(duration=120)
47
+ def speech_to_text(video_file_path):
48
  """
49
  Takes a video path to convert to audio, transcribe audio channel to text and char timestamps
50
 
51
+ Using AutomaticSpeechRecognitionPipeline with pre-loaded model for Spaces Zero compatibility
52
  """
 
53
  if video_file_path == None:
54
  raise ValueError("Error no video input")
55
 
 
66
  except Exception as e:
67
  raise RuntimeError("Error converting video to audio")
68
 
 
 
69
  try:
70
+ print("Transcribing via local model")
71
  output = speech_recognizer(
72
  audio_memory,
73
  return_timestamps="char",
 
84
  ]
85
  for chunk in output["chunks"]
86
  ]
 
 
 
 
 
 
 
87
  return (transcription, transcription, timestamps)
88
  except Exception as e:
89
  raise RuntimeError("Error Running inference with local model", e)
90
 
91
 
92
+ def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
93
  """
94
  Given original video input, text transcript + timestamps,
95
  and edited text, cuts video segments into a single video
96
  """
 
97
 
98
  video_path = Path(video_in)
99
  video_file_name = video_path.stem
 
148
 
149
  tokens = [(token[2:], token[0] if token[0] != " " else None) for token in filtered]
150
 
 
 
 
151
  return (tokens, output_video)
152
 
153
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  # ---- Gradio Layout -----
155
  video_in = gr.Video(label="Video file", elem_id="video-container")
156
  text_in = gr.Textbox(label="Transcription", lines=10, interactive=True)
requirements.txt CHANGED
@@ -5,5 +5,4 @@ datasets
5
  librosa
6
  ffmpeg-python
7
  python-dotenv
8
- aiohttp
9
  spaces
 
5
  librosa
6
  ffmpeg-python
7
  python-dotenv
 
8
  spaces