import gradio as gr
import torch
import torchaudio
import soundfile as sf
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration

# Load the Whisper model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
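# Note: openai/whisper-large is the ~1.5B-parameter checkpoint and is slow on
# CPU; smaller checkpoints such as openai/whisper-small trade accuracy for speed.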

# Load the Hugging Face emotion classifier
emotion_classifier = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)
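# top_k=None makes the pipeline return scores for all 28 GoEmotions labels
# instead of only the single highest-scoring one.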


# Process an audio file with Whisper and analyze emotions in the transcription
def transcribe_and_analyze(audio_path):
    # Load audio from the provided file
    audio, sample_rate = sf.read(audio_path)

    # Mix multi-channel audio down to mono, since Whisper expects a single channel
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # Resample to the 16 kHz rate Whisper was trained on, if necessary
    if sample_rate != 16000:
        audio_tensor = torchaudio.functional.resample(
            torch.tensor(audio, dtype=torch.float32), orig_freq=sample_rate, new_freq=16000
        )
        audio = audio_tensor.numpy()  # Convert back to a numpy array

    # Convert the waveform into Whisper input features and transcribe
    input_features = processor(audio, sampling_rate=16000, return_tensors="pt")
    predicted_ids = model.generate(input_features.input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    # Analyze emotions in the transcription
    emotions = emotion_classifier(transcription)
    return transcription, emotions


# Create the Gradio interface
interface = gr.Interface(
    fn=transcribe_and_analyze,
    inputs=gr.Audio(type="filepath"),  # Accept audio input as a file path
    outputs=[
        gr.Textbox(label="Transcription"),  # Display the transcription
        gr.JSON(label="Emotion Analysis"),  # Display the emotion scores
    ],
    title="Audio to Emotion Analysis",
)

# Launch the Gradio app
interface.launch()
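
# A minimal sketch of exercising the function without the web UI; the file
# name "sample.wav" is a placeholder, not something the app itself provides:
#
#   text, scores = transcribe_and_analyze("sample.wav")
#   print(text)
#   print(scores)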