# HuggingFace Space: Cantonese Song Sentiment Analyzer (scrape header removed)
| import streamlit as st | |
| from transformers import WhisperForConditionalGeneration, WhisperProcessor | |
| from transformers import pipeline | |
| import librosa | |
| import torch | |
| # from spleeter.separator import Separator | |
| from pydub import AudioSegment | |
| from IPython.display import Audio | |
| import os | |
| import accelerate | |
# preprocess and crop audio file
def audio_preprocess(file_name='/test1/vocals.wav'):
    """Separate vocals from music and crop a fixed window of the audio.

    Args:
        file_name: Path to the input audio file. The cropped segment is
            taken from this same file after stem separation.

    Returns:
        pydub.AudioSegment: the cropped segment covering 60 s - 110 s.

    Note:
        This function is currently bypassed by main(); spleeter is
        imported lazily here so the rest of the app works without it.
    """
    # Lazy import: the top-of-file spleeter import is commented out,
    # so pull it in only when this function is actually used.
    from spleeter.separator import Separator

    # separate music and vocal into two stems (vocals / accompaniment)
    separator = Separator('spleeter:2stems')
    # BUG FIX: the original passed undefined names `input_file` and
    # `output_file`; use this function's own parameter. Output goes
    # next to the input file — TODO confirm desired output directory.
    separator.separate_to_file(file_name, os.path.dirname(file_name) or '.')

    # Crop the audio (pydub slices in milliseconds)
    start_time = 60000   # 60 s
    end_time = 110000    # 110 s
    audio = AudioSegment.from_file(file_name)
    cropped_audio = audio[start_time:end_time]
    # .export('cropped_vocals.wav', format='wav')  # save vocal audio file
    return cropped_audio
# ASR transcription
def asr_model(processed_audio):
    """Transcribe Cantonese song vocals with a fine-tuned Whisper model.

    Args:
        processed_audio: path or file-like object readable by librosa
            (e.g. a Streamlit UploadedFile).

    Returns:
        str: the decoded transcription of the audio.
    """
    # load audio resampled to 16 kHz, the rate Whisper expects
    y, sr = librosa.load(processed_audio, sr=16000)

    # ASR model
    MODEL_NAME = "RexChan/ISOM5240-whisper-small-zhhk_1"
    processor = WhisperProcessor.from_pretrained(MODEL_NAME)
    model = WhisperForConditionalGeneration.from_pretrained(
        MODEL_NAME, low_cpu_mem_usage=True
    )
    # decode freely: no forced decoder prefix, nothing suppressed
    model.config.forced_decoder_ids = None
    model.config.suppress_tokens = []
    model.config.use_cache = False
    # FIX: put the model in inference mode (disables dropout)
    model.eval()

    processed_in = processor(y, sampling_rate=sr, return_tensors="pt")
    # FIX: no_grad avoids building an unused autograd graph during
    # generation, cutting memory use; the generated tokens are identical.
    with torch.no_grad():
        gout = model.generate(
            input_features=processed_in.input_features,
            output_scores=True,
            return_dict_in_generate=True,
        )
    transcription = processor.batch_decode(gout.sequences, skip_special_tokens=True)[0]

    # print result
    print(f"Song lyrics = {transcription}")
    return transcription
# sentiment analysis
def senti_model(transcription):
    """Classify the sentiment of the transcribed lyrics.

    Args:
        transcription: lyrics text produced by asr_model().

    Returns:
        str: human-readable summary with the predicted label and
        confidence percentage.
    """
    pipe = pipeline(
        "text-classification",
        model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    )
    final_result = pipe(transcription)
    # FIX: corrected "Confident level" -> "Confidence level" in the
    # user-facing message.
    display = (
        f"Sentiment Analysis shows that this song is {final_result[0]['label']}. "
        f"Confidence level of this analysis is {final_result[0]['score']*100:.1f}%."
    )
    print(display)
    return display
# main
def main(input_file):
    """Run the full pipeline on an uploaded song and render the result.

    Args:
        input_file: the Streamlit UploadedFile (mp3) to analyze.
    """
    # Vocal separation / cropping is currently disabled; the raw
    # upload is fed straight to the ASR model.
    # processed_audio = audio_preprocess(input_file)
    processed_audio = input_file
    transcription = asr_model(processed_audio)
    final_result = senti_model(transcription)
    st.write(final_result)

    if st.button("Play Audio"):
        # BUG FIX: the original referenced an undefined `audio_data`
        # dict, raising NameError on click. Play the uploaded file
        # itself; the upload widget only accepts mp3, so use that
        # format rather than the original "audio/wav".
        st.audio(input_file, format="audio/mp3", start_time=0)
if __name__ == '__main__':
    # Streamlit page setup
    st.set_page_config(page_title="Sentiment Analysis on Your Cantonese Song",)
    st.header("Cantonese Song Sentiment Analyzer")

    # Song upload widget (mp3 only)
    input_file = st.file_uploader("upload a song in mp3 format", type="mp3")
    if input_file is None:
        st.write("No file uploaded.")
    else:
        st.write("File uploaded successfully!")
        st.write(input_file)

    button_click = st.button("Run Analysis", type="primary")

    # load song
    # input_file = os.path.isfile("test1.mp3")
    # output_file = os.path.isdir("")

    # Kick off the pipeline only once the user clicks the button
    if button_click:
        main(input_file=input_file)