Spaces:

Lenylvt
/

VideoSubtitleCreation-API

Running on Zero

App Files Community

Lenylvt committed on Feb 17, 2024

Commit

c18dcee

verified ·

1 Parent(s): 8c56203

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -31

app.py CHANGED Viewed

@@ -1,12 +1,8 @@
 import gradio as gr
 from faster_whisper import WhisperModel
 import logging
-import os
-from moviepy.editor import VideoFileClip
-import ffmpeg  # Make sure to install ffmpeg-python
 from transformers import MarianMTModel, MarianTokenizer
 import pandas as pd
-import pysrt
 import requests
 # Configure logging for debugging purposes
@@ -22,29 +18,18 @@ df['ISO 639-1'] = df['ISO 639-1'].str.strip()
 # Prepare language options for the dropdown
 language_options = [(row['ISO 639-1'], f"{row['ISO 639-1']}") for index, row in df.iterrows()]
-def format_timestamp(seconds):
-    """Convert seconds to HH:MM:SS.mmm format."""
-    hours = int(seconds // 3600)
-    minutes = int((seconds % 3600) // 60)
-    seconds_remainder = seconds % 60
-    return f"{hours:02d}:{minutes:02d}:{seconds_remainder:06.3f}"
-def extract_audio(video_path):
-    """Extract audio from video to a temporary audio file."""
-    output_audio_path = '/tmp/audio.wav'
-    ffmpeg.input(video_path).output(output_audio_path, acodec='pcm_s16le', ac=1, ar='16k').run(quiet=True)
-    return output_audio_path
-def transcribe_and_optionally_translate(video_file, source_language, target_language, model_size, allow_modification):
-    audio_file = extract_audio(video_file)
     # Transcription
-    device = "cpu"  # GPU : cuda  CPU : cpu
-    compute_type = "int8"  # GPU : float16 or int8 - CPU : int8
     model = WhisperModel(model_size, device=device, compute_type=compute_type)
     segments, _ = model.transcribe(audio_file)
     transcription = " ".join([segment.text for segment in segments])
     # Translation
     if source_language != target_language:
         model_name = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"
@@ -53,7 +38,7 @@ def transcribe_and_optionally_translate(video_file, source_language, target_lang
         translated = model.generate(**tokenizer(transcription, return_tensors="pt", padding=True, truncation=True, max_length=512))
         transcription = tokenizer.decode(translated[0], skip_special_tokens=True)
-    return transcription, allow_modification
 def add_hard_subtitle_to_video(input_video, transcript):
     """Add hard subtitles to video."""
@@ -66,14 +51,16 @@ def add_hard_subtitle_to_video(input_video, transcript):
     return output_video_path
-# Gradio Interface
-def process_video(video, source_language, target_language, model_size='base', allow_modification=False, modified_transcript=None):
-    transcript, can_modify = transcribe_and_optionally_translate(video, source_language, target_language, model_size, allow_modification)
     if can_modify and modified_transcript:
-        transcript = modified_transcript  # Use the modified transcript if provided
-    # Add hard subtitles to the video
     output_video = add_hard_subtitle_to_video(video, transcript)
     return output_video
@@ -81,17 +68,18 @@ def process_video(video, source_language, target_language, model_size='base', al
 app = gr.Interface(
     fn=process_video,
     inputs=[
-        gr.Video(label="Upload Video"),
         gr.Dropdown(choices=language_options, label="Source Language"),
         gr.Dropdown(choices=language_options, label="Target Language"),
         gr.Dropdown(choices=["base", "small", "medium", "large", "large-v2", "large-v3"], label="Model Size"),
-        gr.Checkbox(label="Allow Transcript Modification?", value=False),
         gr.TextArea(label="Modified Transcript (if allowed)")
     ],
-    outputs=gr.Video(label="Processed Video with Hard Subtitles"),
     title="Video Transcription and Translation Tool",
     description="Transcribe or translate your video content. Optionally, edit the transcription before adding hard subtitles."
 )
 if __name__ == "__main__":
     app.launch()

 import gradio as gr
 from faster_whisper import WhisperModel
 import logging
 from transformers import MarianMTModel, MarianTokenizer
 import pandas as pd
 import requests
 # Configure logging for debugging purposes
 # Prepare language options for the dropdown
 language_options = [(row['ISO 639-1'], f"{row['ISO 639-1']}") for index, row in df.iterrows()]
+def transcribe_and_optionally_translate(audio_file, source_language, target_language, model_size, change_transcript):
     # Transcription
+    device = "cpu"  # Use "cuda" for GPU
+    compute_type = "int8"  # Use "float16" or "int8" for GPU, "int8" for CPU
     model = WhisperModel(model_size, device=device, compute_type=compute_type)
     segments, _ = model.transcribe(audio_file)
     transcription = " ".join([segment.text for segment in segments])
+    if change_transcript:
+        # Assume user will modify the transcript manually before translation
+        return transcription, True
     # Translation
     if source_language != target_language:
         model_name = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"
         translated = model.generate(**tokenizer(transcription, return_tensors="pt", padding=True, truncation=True, max_length=512))
         transcription = tokenizer.decode(translated[0], skip_special_tokens=True)
+    return transcription, False
 def add_hard_subtitle_to_video(input_video, transcript):
     """Add hard subtitles to video."""
     return output_video_path
+def process_video(video, source_language, target_language, model_size='base', change_transcript=False, modified_transcript=None):
+    audio_file = video  # Directly use the video file as the audio input
+    transcript, can_modify = transcribe_and_optionally_translate(audio_file, source_language, target_language, model_size, change_transcript)
     if can_modify and modified_transcript:
+        # Use the modified transcript for translation if allowed and provided
+        transcript = modified_transcript
+        # Perform translation here if necessary (similar to the previous step)
     output_video = add_hard_subtitle_to_video(video, transcript)
     return output_video
 app = gr.Interface(
     fn=process_video,
     inputs=[
+        gr.Video(label="Upload Video", type="filepath"),
         gr.Dropdown(choices=language_options, label="Source Language"),
         gr.Dropdown(choices=language_options, label="Target Language"),
         gr.Dropdown(choices=["base", "small", "medium", "large", "large-v2", "large-v3"], label="Model Size"),
+        gr.Checkbox(label="Change Transcript before Translation?", value=False),
         gr.TextArea(label="Modified Transcript (if allowed)")
     ],
+    outputs=gr.Text(label="Transcript"),
     title="Video Transcription and Translation Tool",
     description="Transcribe or translate your video content. Optionally, edit the transcription before adding hard subtitles."
 )
 if __name__ == "__main__":
     app.launch()