Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,71 +3,71 @@ import torch
|
|
| 3 |
from faster_whisper import WhisperModel
|
| 4 |
import pandas as pd
|
| 5 |
|
| 6 |
-
#
|
| 7 |
model_size = "large-v2"
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
|
| 12 |
-
# Initialize model based on
|
| 13 |
-
device = get_device()
|
| 14 |
if device == "cuda:0":
|
|
|
|
| 15 |
model_whisper = WhisperModel(model_size, device="cuda", compute_type="float16")
|
| 16 |
else:
|
|
|
|
| 17 |
model_whisper = WhisperModel(model_size, device="cpu", compute_type="int8")
|
| 18 |
|
|
|
|
| 19 |
def get_filename(file_obj):
|
| 20 |
return file_obj.name.split("/")[-1]
|
| 21 |
|
|
|
|
| 22 |
def audio_to_transcript(file_obj):
|
| 23 |
-
"""Converts uploaded audio to a transcript with timestamps."""
|
| 24 |
-
if not file_obj:
|
| 25 |
-
return "No file uploaded.", None, None
|
| 26 |
-
|
| 27 |
-
filename = get_filename(file_obj)
|
| 28 |
try:
|
|
|
|
| 29 |
segments, _ = model_whisper.transcribe(file_obj.name, beam_size=5, vad_filter=True)
|
| 30 |
except:
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
-
|
| 34 |
for segment in segments:
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
df.to_csv(csv_file, encoding="utf-8", index=False)
|
| 42 |
-
|
| 43 |
-
return filename, csv_file, df
|
| 44 |
|
| 45 |
-
|
| 46 |
-
definition = """
|
| 47 |
-
## 🎙️ Audio Transcription App
|
| 48 |
-
This app allows you to upload an audio file and get an accurate transcript with timestamps.
|
| 49 |
-
It uses **Faster-Whisper**, a fast and efficient ASR model, to generate transcriptions.
|
| 50 |
-
Simply upload your file, and the app will process and return a CSV transcript.
|
| 51 |
-
"""
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
audio_input.change(update_file_name, inputs=[audio_input], outputs=[file_preview])
|
| 71 |
-
transcribe_btn.click(audio_to_transcript, inputs=[audio_input], outputs=[file_preview, csv_output, transcript_df], show_progress=True)
|
| 72 |
|
| 73 |
iface.launch(debug=True)
|
|
|
|
| 3 |
from faster_whisper import WhisperModel
|
| 4 |
import pandas as pd
|
| 5 |
|
| 6 |
# Whisper model configuration.
model_size = "large-v2"

# Pick the compute device: first CUDA GPU when available, else CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load the model with a precision suited to the device:
# FP16 on GPU for speed, INT8 on CPU to keep memory usage low.
if device == "cuda:0":
    model_whisper = WhisperModel(model_size, device="cuda", compute_type="float16")
else:
    model_whisper = WhisperModel(model_size, device="cpu", compute_type="int8")
|
| 19 |
|
| 20 |
+
# Function to get filename from file object
|
| 21 |
def get_filename(file_obj):
    """Return the bare file name (last path component) of a Gradio file object."""
    _head, _sep, tail = file_obj.name.rpartition("/")
    return tail
|
| 23 |
|
| 24 |
+
# Function to transcribe audio to text
|
| 25 |
def audio_to_transcript(file_obj):
    """Transcribe an uploaded audio file and export the transcript as CSV.

    Parameters
    ----------
    file_obj : file-like or str
        Either a Gradio file wrapper exposing a ``.name`` path attribute,
        or a plain filesystem path string (newer Gradio ``type="filepath"``).

    Returns
    -------
    tuple
        ``(filename, path_to_csv, df)`` — the bare audio file name, a
        Gradio file-component update pointing at the written CSV, and a
        DataFrame with ``start``/``end``/``text`` columns.
    """
    # Gradio may hand us either a wrapper object with .name or a raw path
    # string depending on component/version. Dispatch explicitly instead of
    # the original bare `except:`, which also swallowed real transcription
    # errors and then crashed retrying with the wrong type.
    if hasattr(file_obj, "name"):
        audio_path = file_obj.name
    else:
        audio_path = file_obj
    filename = audio_path.split("/")[-1]

    # Single transcription call; beam search + VAD filtering as before.
    segments, _ = model_whisper.transcribe(audio_path, beam_size=5, vad_filter=True)

    # Collect per-segment timing and text in one pass (segments is a
    # generator, so it is consumed exactly once).
    rows = [(segment.start, segment.end, segment.text) for segment in segments]
    df = pd.DataFrame(rows, columns=["start", "end", "text"])

    # Name the CSV after the audio file and write it to the working directory.
    csv_file = filename.split(".")[0] + ".csv"
    df.to_csv(csv_file, encoding="utf-8", index=False)
    path_to_csv = gr.File.update(value=csv_file, visible=True)

    return filename, path_to_csv, df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
# ---- Gradio interface setup ----
headers = ["start", "end", "text"]

# Output components: file name, downloadable CSV, and an inline transcript table.
_outputs = [
    gr.Textbox(label="Audio file name"),
    gr.File(label="Transcript CSV file"),
    gr.DataFrame(label="Transcript", headers=headers),
]

iface = gr.Interface(
    fn=audio_to_transcript,
    inputs=gr.File(label="Upload an Audio File", type="filepath"),
    outputs=_outputs,
    allow_flagging="never",
    title="Audio to Transcript",
    description="Upload an audio file, and this tool will return a transcript with time-stamped segments.",
    theme="compact",  # compact theme keeps the layout simple
)

iface.launch(debug=True)
|