Spaces:
Runtime error
Runtime error
Vaibhav Srivastav
committed on
Commit
·
8d69919
1
Parent(s):
f6bce7b
for the love of god please work
Browse files
app.py
CHANGED
|
@@ -7,9 +7,9 @@ from transformers import AutoProcessor, AutoModelForCTC
|
|
| 7 |
|
| 8 |
nltk.download("punkt")
|
| 9 |
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
|
| 14 |
def load_and_fix_data(input_file):
|
| 15 |
#read the file
|
|
@@ -26,7 +26,8 @@ def fix_transcription_casing(input_sentence):
|
|
| 26 |
sentences = nltk.sent_tokenize(input_sentence)
|
| 27 |
return (' '.join([s.replace(s[0],s[0].capitalize(),1) for s in sentences]))
|
| 28 |
|
| 29 |
-
def predict_and_ctc_decode(input_file):
|
|
|
|
| 30 |
speech = load_and_fix_data(input_file)
|
| 31 |
|
| 32 |
input_values = processor(speech, return_tensors="pt", sampling_rate=16000).input_values
|
|
@@ -40,7 +41,8 @@ def predict_and_ctc_decode(input_file):
|
|
| 40 |
|
| 41 |
return transcribed_text
|
| 42 |
|
| 43 |
-
def predict_and_greedy_decode(input_file):
|
|
|
|
| 44 |
speech = load_and_fix_data(input_file)
|
| 45 |
|
| 46 |
input_values = processor(speech, return_tensors="pt", sampling_rate=16000).input_values
|
|
@@ -54,14 +56,13 @@ def predict_and_greedy_decode(input_file):
|
|
| 54 |
return transcribed_text
|
| 55 |
|
| 56 |
def return_all_predictions(input_file, model_name):
|
| 57 |
-
|
| 58 |
-
return predict_and_ctc_decode(input_file), predict_and_greedy_decode(input_file)
|
| 59 |
|
| 60 |
|
| 61 |
gr.Interface(return_all_predictions,
|
| 62 |
inputs = [gr.inputs.Audio(source="microphone", type="filepath", label="Record/ Drop audio"), gr.inputs.Dropdown(["facebook/wav2vec2-base-960h", "facebook/hubert-large-ls960-ft"], label="Model Name")],
|
| 63 |
-
outputs = [gr.outputs.Textbox(label="Beam CTC
|
| 64 |
-
title="ASR using
|
| 65 |
-
description = "
|
| 66 |
layout = "horizontal",
|
| 67 |
examples = [["test1.wav", "facebook/wav2vec2-base-960h"], ["test2.wav", "facebook/hubert-large-ls960-ft"]], theme="huggingface").launch()
|
|
|
|
| 7 |
|
| 8 |
nltk.download("punkt")
|
| 9 |
|
| 10 |
+
|
| 11 |
+
def return_processor_and_model(model_name):
|
| 12 |
+
return AutoProcessor.from_pretrained(model_name), AutoModelForCTC.from_pretrained(model_name)
|
| 13 |
|
| 14 |
def load_and_fix_data(input_file):
|
| 15 |
#read the file
|
|
|
|
| 26 |
sentences = nltk.sent_tokenize(input_sentence)
|
| 27 |
return (' '.join([s.replace(s[0],s[0].capitalize(),1) for s in sentences]))
|
| 28 |
|
| 29 |
+
def predict_and_ctc_decode(input_file, model_name):
|
| 30 |
+
processor, model = return_processor_and_model(model_name)
|
| 31 |
speech = load_and_fix_data(input_file)
|
| 32 |
|
| 33 |
input_values = processor(speech, return_tensors="pt", sampling_rate=16000).input_values
|
|
|
|
| 41 |
|
| 42 |
return transcribed_text
|
| 43 |
|
| 44 |
+
def predict_and_greedy_decode(input_file, model_name):
|
| 45 |
+
processor, model = return_processor_and_model(model_name)
|
| 46 |
speech = load_and_fix_data(input_file)
|
| 47 |
|
| 48 |
input_values = processor(speech, return_tensors="pt", sampling_rate=16000).input_values
|
|
|
|
| 56 |
return transcribed_text
|
| 57 |
|
| 58 |
def return_all_predictions(input_file, model_name):
|
| 59 |
+
return predict_and_ctc_decode(input_file, model_name), predict_and_greedy_decode(input_file, model_name)
|
|
|
|
| 60 |
|
| 61 |
|
| 62 |
gr.Interface(return_all_predictions,
|
| 63 |
inputs = [gr.inputs.Audio(source="microphone", type="filepath", label="Record/ Drop audio"), gr.inputs.Dropdown(["facebook/wav2vec2-base-960h", "facebook/hubert-large-ls960-ft"], label="Model Name")],
|
| 64 |
+
outputs = [gr.outputs.Textbox(label="Beam CTC decoding"), gr.outputs.Textbox(label="Greedy decoding")],
|
| 65 |
+
title="ASR using Wav2Vec2/ Hubert & pyctcdecode",
|
| 66 |
+
description = "Comparing Wav2Vec2 & Hubert with Greedy vs Beam Search decoding",
|
| 67 |
layout = "horizontal",
|
| 68 |
examples = [["test1.wav", "facebook/wav2vec2-base-960h"], ["test2.wav", "facebook/hubert-large-ls960-ft"]], theme="huggingface").launch()
|
test.wav
DELETED
|
Binary file (165 kB)
|
|
|