Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -59,16 +59,34 @@ LANGUANGE_MAP = {
 }
 
 
-processor = WhisperProcessor.from_pretrained(model_id)
-model = WhisperForConditionalGeneration.from_pretrained(model_id)
-model.eval()
-model.to(device)
+from pytube import YouTube
+import whisper
 
+# define function for transcription
+def transcribe(Microphone, File_Upload):
+    warn_output = ""
+    if (Microphone is not None) and (File_Upload is not None):
+        warn_output = "WARNING: You've uploaded an audio file and used the microphone. " \
+                      "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
+        file = Microphone
 
-bos_token_id = processor.tokenizer.all_special_ids[-106]
-decoder_input_ids = torch.tensor([bos_token_id]).to(device)
+    elif (Microphone is None) and (File_Upload is None):
+        return "ERROR: You have to either use the microphone or upload an audio file"
 
+    elif Microphone is not None:
+        file = Microphone
+    else:
+        file = File_Upload
+
+
+    language = None
+
+    options = whisper.DecodingOptions(without_timestamps=True)
 
+    loaded_model = whisper.load_model("base")
+    transcript = loaded_model.transcribe(file, language=language)
+
+    return detect_language(transcript["text"])
 
 def detect_language(sentence):
 
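Note on the new code path: the added transcribe swaps the hand-rolled WhisperProcessor pipeline (removed above, restored verbatim inside the string literal in the next hunk) for the high-level openai-whisper API. Stripped of the microphone/upload juggling, it reduces to the sketch below; this is a minimal sketch assuming the openai-whisper package is installed, and sample1.mp3 is one of the example files listed at the end of the diff. The options object and the pytube import added above are currently unused.

import whisper

# Load the multilingual "base" checkpoint (downloaded on first use).
model = whisper.load_model("base")

# language=None lets Whisper auto-detect the spoken language,
# matching `language = None` in the new transcribe().
result = model.transcribe("sample1.mp3", language=None)

print(result["text"])  # the raw transcription string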
@@ -80,7 +98,18 @@ def detect_language(sentence):
     predictions = torch.nn.functional.softmax(output.logits, dim=-1)
     probability, pred_idx = torch.max(predictions, dim=-1)
     language = LANGUANGE_MAP[pred_idx.item()]
-    return language, probability.item()
+    return sentence, language, probability.item()
+
+
+"""
+processor = WhisperProcessor.from_pretrained(model_id)
+model = WhisperForConditionalGeneration.from_pretrained(model_id)
+model.eval()
+model.to(device)
+
+
+bos_token_id = processor.tokenizer.all_special_ids[-106]
+decoder_input_ids = torch.tensor([bos_token_id]).to(device)
 
 
 def process_audio_file(file, sampling_rate):
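Note on the signature change: detect_language now returns the input sentence as well, so the new transcribe can pass detect_language(transcript["text"]) straight through as all three of its outputs. Roughly (the example string is illustrative; the label depends on LANGUANGE_MAP):

# detect_language now returns a 3-tuple instead of a 2-tuple.
sentence, language, probability = detect_language("bonjour tout le monde")
# sentence    -> the input text, unchanged
# language    -> LANGUANGE_MAP[pred_idx.item()]
# probability -> the winning softmax score as a Python float

The disabled pipeline still unpacks two values (language, probability = detect_language(transcription) in the hunk below), so it would need the same update if it is ever re-enabled.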
@@ -123,7 +152,7 @@ def transcribe(Microphone, File_Upload):
     language, probability = detect_language(transcription)
 
     return transcription.capitalize(), language, probability
-
+"""
 
 examples=['sample1.mp3', 'sample2.mp3', 'sample3.mp3']
 examples = [[f"./{f}"] for f in examples]
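Note on the closing quotes: the +""" line completes the string literal opened in the previous hunk, so everything from the old model setup through the old transcribe body is now dead code rather than deleted. One loose end, possibly related to the Space's runtime-error status: the new transcribe returns a bare string when neither input is given but a 3-tuple otherwise, so a consumer expecting three outputs will break on the error path. A possible shape for a fix (a sketch only; safe_transcribe and the placeholder values are not part of this commit):

# Hypothetical wrapper keeping a consistent (text, language, probability)
# arity on the error path; transcribe() is the function added in this commit.
def safe_transcribe(Microphone, File_Upload):
    if Microphone is None and File_Upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file", "n/a", 0.0
    return transcribe(Microphone, File_Upload)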