radames committed on
Commit
9982a93
·
1 Parent(s): 146fed7

break it apart

Browse files
Files changed (2) hide show
  1. app.py +19 -42
  2. requirements.txt +0 -1
app.py CHANGED
@@ -1,12 +1,14 @@
1
  import torch
2
- from transformers import pipeline
 
 
 
 
3
  import gradio as gr
4
  import json
5
  from difflib import Differ
6
  import ffmpeg
7
  from pathlib import Path
8
- import aiohttp
9
- import asyncio
10
  import spaces
11
 
12
  # Set true if you're using huggingface inference API https://huggingface.co/inference-api
@@ -16,12 +18,16 @@ MODEL = "facebook/wav2vec2-large-960h"
16
  # MODEL = "facebook/wav2vec2-base-960h"
17
  # MODEL = "patrickvonplaten/wav2vec2-large-960h-lv60-self-4-gram"
18
 
19
- speech_recognizer = pipeline(
20
- task="automatic-speech-recognition",
21
- model=f"{MODEL}",
22
- tokenizer=f"{MODEL}",
23
- framework="pt",
24
- device="cuda",
 
 
 
 
25
  )
26
 
27
 
@@ -36,18 +42,14 @@ for file in samples_data:
36
  SAMPLES.append(sample)
37
  VIDEOS = list(map(lambda x: [x["video"]], SAMPLES))
38
 
39
- total_inferences_since_reboot = 415
40
- total_cuts_since_reboot = 1539
41
-
42
 
43
  @spaces.GPU(duration=120)
44
- async def speech_to_text(video_file_path):
45
  """
46
  Takes a video path to convert to audio, transcribe audio channel to text and char timestamps
47
 
48
- Using https://huggingface.co/tasks/automatic-speech-recognition pipeline
49
  """
50
- global total_inferences_since_reboot
51
  if video_file_path == None:
52
  raise ValueError("Error no video input")
53
 
@@ -64,10 +66,8 @@ async def speech_to_text(video_file_path):
64
  except Exception as e:
65
  raise RuntimeError("Error converting video to audio")
66
 
67
- ping("speech_to_text")
68
-
69
  try:
70
- print(f"Transcribing via local model")
71
  output = speech_recognizer(
72
  audio_memory,
73
  return_timestamps="char",
@@ -84,24 +84,16 @@ async def speech_to_text(video_file_path):
84
  ]
85
  for chunk in output["chunks"]
86
  ]
87
- total_inferences_since_reboot += 1
88
-
89
- print(
90
- "\n\ntotal_inferences_since_reboot: ",
91
- total_inferences_since_reboot,
92
- "\n\n",
93
- )
94
  return (transcription, transcription, timestamps)
95
  except Exception as e:
96
  raise RuntimeError("Error Running inference with local model", e)
97
 
98
 
99
- async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
100
  """
101
  Given original video input, text transcript + timestamps,
102
  and edited text, cuts video segments into a single video
103
  """
104
- global total_cuts_since_reboot
105
 
106
  video_path = Path(video_in)
107
  video_file_name = video_path.stem
@@ -156,24 +148,9 @@ async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
156
 
157
  tokens = [(token[2:], token[0] if token[0] != " " else None) for token in filtered]
158
 
159
- total_cuts_since_reboot += 1
160
- ping("video_cuts")
161
- print("\n\ntotal_cuts_since_reboot: ", total_cuts_since_reboot, "\n\n")
162
  return (tokens, output_video)
163
 
164
 
165
- def ping(name):
166
- url = f"https://huggingface.co/api/telemetry/spaces/radames/edit-video-by-editing-text/{name}"
167
- print("ping: ", url)
168
-
169
- async def req():
170
- async with aiohttp.ClientSession() as session:
171
- async with session.get(url) as response:
172
- print("pong: ", response.status)
173
-
174
- asyncio.create_task(req())
175
-
176
-
177
  # ---- Gradio Layout -----
178
  video_in = gr.Video(label="Video file", elem_id="video-container")
179
  text_in = gr.Textbox(label="Transcription", lines=10, interactive=True)
 
1
  import torch
2
+ from transformers import (
3
+ Wav2Vec2ForCTC,
4
+ Wav2Vec2Processor,
5
+ AutomaticSpeechRecognitionPipeline,
6
+ )
7
  import gradio as gr
8
  import json
9
  from difflib import Differ
10
  import ffmpeg
11
  from pathlib import Path
 
 
12
  import spaces
13
 
14
  # Set true if you're using huggingface inference API https://huggingface.co/inference-api
 
18
  # MODEL = "facebook/wav2vec2-base-960h"
19
  # MODEL = "patrickvonplaten/wav2vec2-large-960h-lv60-self-4-gram"
20
 
21
+ # Load model and processor for manual processing (Spaces Zero compatible)
22
+ model = Wav2Vec2ForCTC.from_pretrained(MODEL).to("cuda")
23
+ processor = Wav2Vec2Processor.from_pretrained(MODEL)
24
+
25
+ # Create pipeline with pre-loaded model and processor
26
+ speech_recognizer = AutomaticSpeechRecognitionPipeline(
27
+ model=model,
28
+ feature_extractor=processor.feature_extractor,
29
+ tokenizer=processor.tokenizer,
30
+ device=0, # Use first CUDA device
31
  )
32
 
33
 
 
42
  SAMPLES.append(sample)
43
  VIDEOS = list(map(lambda x: [x["video"]], SAMPLES))
44
 
 
 
 
45
 
46
  @spaces.GPU(duration=120)
47
+ def speech_to_text(video_file_path):
48
  """
49
  Takes a video path to convert to audio, transcribe audio channel to text and char timestamps
50
 
51
+ Using AutomaticSpeechRecognitionPipeline with pre-loaded model for Spaces Zero compatibility
52
  """
 
53
  if video_file_path == None:
54
  raise ValueError("Error no video input")
55
 
 
66
  except Exception as e:
67
  raise RuntimeError("Error converting video to audio")
68
 
 
 
69
  try:
70
+ print("Transcribing via local model")
71
  output = speech_recognizer(
72
  audio_memory,
73
  return_timestamps="char",
 
84
  ]
85
  for chunk in output["chunks"]
86
  ]
 
 
 
 
 
 
 
87
  return (transcription, transcription, timestamps)
88
  except Exception as e:
89
  raise RuntimeError("Error Running inference with local model", e)
90
 
91
 
92
+ def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
93
  """
94
  Given original video input, text transcript + timestamps,
95
  and edited text, cuts video segments into a single video
96
  """
 
97
 
98
  video_path = Path(video_in)
99
  video_file_name = video_path.stem
 
148
 
149
  tokens = [(token[2:], token[0] if token[0] != " " else None) for token in filtered]
150
 
 
 
 
151
  return (tokens, output_video)
152
 
153
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  # ---- Gradio Layout -----
155
  video_in = gr.Video(label="Video file", elem_id="video-container")
156
  text_in = gr.Textbox(label="Transcription", lines=10, interactive=True)
requirements.txt CHANGED
@@ -5,5 +5,4 @@ datasets
5
  librosa
6
  ffmpeg-python
7
  python-dotenv
8
- aiohttp
9
  spaces
 
5
  librosa
6
  ffmpeg-python
7
  python-dotenv
 
8
  spaces