Commit d4362f5 · Parent(s): aa5f8c6
up
app.py
CHANGED
@@ -1,4 +1,3 @@
-from transformers import AutoModelForSeq2SeqLM,AutoTokenizer
 import gradio as gr
 from faster_whisper import WhisperModel
 import whisper_timestamped as whisper
@@ -8,18 +7,25 @@ from utils import *
 model = WhisperModel("large-v2")


-
-FinetuneModel = AutoModelForSeq2SeqLM.from_pretrained("FinetuneModel/VoidFilter")
+

 def VoidFilter(audio_file):
     audio = whisper.load_audio(audio_file)
+
     segments, _ = model.transcribe(audio_file, word_timestamps=True)
+
     word_timestamps = get_word_timestamps(segments)
+
     transcription = get_transcription(word_timestamps)
-
+
+    filtered_text = filterText(transcription)
+
     modified_timestamps = get_modified_timestamps(word_timestamps,filtered_text)
+
     final_audio, sample_rate = cut_audio(audio, modified_timestamps)
+
     sf.write("filtered_audio.wav", final_audio, sample_rate)
+
     return "filtered_audio.wav"


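For reference, the Gradio wiring that exposes VoidFilter is outside this diff; below is a minimal sketch of how it could look, assuming a plain audio-in/audio-out interface (the component choices and launch block are an assumption, not taken from this commit).

# Hypothetical wiring, not part of this commit: exposes VoidFilter as a
# simple audio-in / audio-out Gradio demo.
demo = gr.Interface(
    fn=VoidFilter,                        # takes the path of an uploaded audio file
    inputs=gr.Audio(type="filepath"),     # pass the upload through as a file path
    outputs=gr.Audio(type="filepath"),    # play back the filtered WAV that VoidFilter returns
    title="VoidFilter",
)

if __name__ == "__main__":
    demo.launch()

Returning the path of the written "filtered_audio.wav" matches VoidFilter's return value, so an audio output component with type="filepath" can play it directly.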
utils.py
CHANGED
@@ -1,4 +1,8 @@
 import numpy as np
+from transformers import AutoModelForSeq2SeqLM,AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("FinetuneModel/VoidFilter")
+FinetuneModel = AutoModelForSeq2SeqLM.from_pretrained("FinetuneModel/VoidFilter")


 def cut_audio(audio, timestamps):
@@ -69,48 +73,10 @@ def get_modified_timestamps(word_timestamps,filtered_text):
     return mod_timestemp


-
-
-
-
-# # Setup the tokenizer for targets
-# labels = tokenizer(text_target=examples["filtered"], max_length=29, truncation=True)
-
-# model_inputs["labels"] = labels["input_ids"]
-# return model_inputs
-
-
-# def compute_metrics(eval_pred):
-# predictions, labels = eval_pred
-# decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
-# # Replace -100 in the labels as we can't decode them.
-# labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
-# decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
-
-# decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
-# decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
-
-# result = {}
-# rouge = Rouge()
-# scores = rouge.get_scores(decoded_preds, decoded_labels, avg=True)
-
-# for k, v in scores.items():
-# if isinstance(v, dict):
-# result.update({f'rouge-{k}': {m: round(val * 100, 4) for m, val in v.items()}})
-# else:
-# result[f'rouge-{k}'] = round(v * 100, 4)
-
-# prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
-# result["gen_len"] = np.mean(prediction_lens)
-# result = {
-# key: value['f']
-# for key, value in result.items()
-# if isinstance(value, dict) and 'f' in value
-# }
-# return result
-
-
-def filterText(text, model,tokenizer):
+
+
+
+def filterText(text, model=FinetuneModel,tokenizer=tokenizer):
     device = 'cpu'
     model = model.to(device)

@@ -133,10 +99,7 @@ def filterText(text, model,tokenizer):
         repetition_penalty=2.0,
         length_penalty=0.5,
         num_beams = 10,
-        num_return_sequences=1,
-        # top_k = 10,
-        # do_sample = True,
-        # top_p = 0.97
+        num_return_sequences=1,
     )
     preds = [
         tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
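With the defaults added to filterText in this commit, callers such as app.py no longer have to load the seq2seq model or tokenizer themselves; a brief usage sketch (the example input string is made up for illustration):

from utils import filterText

# Uses the module-level tokenizer and FinetuneModel now loaded in utils.py.
filtered = filterText("so um I think we should uh head out now")
print(filtered)

# A different seq2seq model/tokenizer pair can still be passed explicitly:
# filtered = filterText(text, model=other_model, tokenizer=other_tokenizer)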