Update app.py
Browse files
app.py
CHANGED
|
@@ -51,7 +51,34 @@ def predict_emotion_from_audio(wav_filepath):
|
|
| 51 |
api_key = os.getenv("DeepAI_api_key")
|
| 52 |
# Define the API key for DeepAI Text to Image API
|
| 53 |
#api_key = 'dee3e3f2-d5cf-474c-8072-bd6bea47e865'
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
# Predict emotion from audio
|
| 56 |
def get_predictions(audio_input):
|
| 57 |
emotion_prediction = predict_emotion_from_audio(audio_input)
|
|
|
|
| 51 |
# API key for the DeepAI Text-to-Image API, read from the environment so the
# secret is never committed to the repository.
api_key = os.getenv("DeepAI_api_key")
# NOTE(review): a hard-coded API key was previously committed here inside a
# comment. It has been removed, but it is still visible in git history —
# rotate/revoke that key on the DeepAI dashboard.
|
| 54 |
import torch
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration

# Speech-to-text model/processor are loaded lazily: the original code ran
# everything at module import time, which (a) downloaded a large checkpoint on
# import and (b) crashed with NameError because `audio_input`/`sampling_rate`
# were never defined (the soundfile read was commented out).
_s2t_model = None
_s2t_processor = None


def _load_speech2text():
    """Load and cache the facebook/s2t-small-librispeech-asr model and processor."""
    global _s2t_model, _s2t_processor
    if _s2t_model is None:
        _s2t_model = Speech2TextForConditionalGeneration.from_pretrained(
            "facebook/s2t-small-librispeech-asr"
        )
        _s2t_processor = Speech2TextProcessor.from_pretrained(
            "facebook/s2t-small-librispeech-asr"
        )
    return _s2t_model, _s2t_processor


def transcribe_audio(audio_input, sampling_rate=16000):
    """Transcribe a mono audio waveform to English text.

    Parameters
    ----------
    audio_input : 1-D float array of raw audio samples,
        e.g. the first value returned by ``soundfile.read(path)``.
        # assumes mono, model-compatible audio — TODO confirm against callers
    sampling_rate : int
        Sample rate of ``audio_input`` in Hz; the model was trained on 16 kHz.

    Returns
    -------
    str
        The decoded transcription.
    """
    model, processor = _load_speech2text()
    # Convert the raw audio into model-ready PyTorch tensors.
    inputs = processor(audio_input, sampling_rate=sampling_rate, return_tensors="pt")
    with torch.no_grad():  # inference only — no gradients needed
        generated_ids = model.generate(
            inputs["input_features"],
            attention_mask=inputs["attention_mask"],
            # NOTE(review): the original passed
            # forced_bos_token_id=processor.tokenizer.bos_token_id; the
            # documented Speech2Text usage calls generate() without it — the
            # model config supplies the decoder start token.
        )
    # batch_decode returns one string per input sequence; we pass a single clip.
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
| 81 |
+
|
| 82 |
# Predict emotion from audio
|
| 83 |
def get_predictions(audio_input):
|
| 84 |
emotion_prediction = predict_emotion_from_audio(audio_input)
|