import streamlit as st import librosa import numpy as np import matplotlib.pyplot as plt from pydub import AudioSegment from transformers import T5Tokenizer, T5ForConditionalGeneration import os import whisper from collections import Counter import torch # Load T5 model and tokenizer tokenizer = T5Tokenizer.from_pretrained("t5-small") model = T5ForConditionalGeneration.from_pretrained("t5-small") def analyze_sentiment_t5(text): """Analyzes sentiment using the T5 model.""" input_text = f"sst2 sentence: {text}" # Formatting input for T5 model input_ids = tokenizer.encode(input_text, return_tensors="pt") output = model.generate(input_ids) sentiment = tokenizer.decode(output[0], skip_special_tokens=True) return "POSITIVE" if "positive" in sentiment.lower() else "NEGATIVE" # Load Whisper model whisper_model = whisper.load_model("base") def highlight_words(text, sentiment="POSITIVE"): """Highlight positive and negative words in transcription.""" # Create a list of positive and negative words (simple example, expand as needed) positive_words = {"good", "great", "awesome", "happy", "positive", "love"} negative_words = {"bad", "sad", "angry", "negative", "hate", "awful"} # Split the transcription into words words = text.split() highlighted_text = [] for word in words: if word.lower() in positive_words: highlighted_text.append(f"{word}") # Green for positive elif word.lower() in negative_words: highlighted_text.append(f"{word}") # Red for negative else: highlighted_text.append(word) # Leave neutral words unchanged # Join the words back into a string return ' '.join(highlighted_text) # Streamlit UI st.title("🎤 Audio Sentiment & Feature Analysis") st.write("Upload an MP3 file to analyze its sentiment and audio features.") uploaded_file = st.file_uploader("Choose an MP3 file", type=["mp3"]) if uploaded_file: file_path = f"temp/{uploaded_file.name}" os.makedirs("temp", exist_ok=True) with open(file_path, "wb") as f: f.write(uploaded_file.getbuffer()) # Convert MP3 to WAV audio = AudioSegment.from_mp3(file_path) wav_path = file_path.replace(".mp3", ".wav") audio.export(wav_path, format="wav") # Load audio y, sr = librosa.load(wav_path, sr=None) # Get audio length in seconds audio_length = librosa.get_duration(y=y, sr=sr) # Transcribe with Whisper result = whisper_model.transcribe(wav_path) transcribed_text = result["text"] # Analyze sentiment sentiment = analyze_sentiment_t5(transcribed_text) sentiment_color = "green" if sentiment == "POSITIVE" else "red" # Highlight positive and negative words in transcription highlighted_transcription = highlight_words(transcribed_text, sentiment) # Display results st.subheader("📊 Sentiment Analysis Result") st.markdown(f"**Overall Sentiment:** {sentiment}", unsafe_allow_html=True) # Display full transcription with highlighted words st.subheader("📝 Full Transcription") st.markdown(highlighted_transcription, unsafe_allow_html=True) # Plot sentiment score vs. audio length fig, ax = plt.subplots(figsize=(10, 5)) sentiment_score = 1 if sentiment == "POSITIVE" else 0 # Simplified sentiment score: 1 for POSITIVE, 0 for NEGATIVE ax.barh(["Sentiment"], [sentiment_score], color=sentiment_color) ax.set_xlim(0, 1) ax.set_xlabel("Sentiment Score") ax.set_title(f"Sentiment Score vs. Audio Length (Duration: {audio_length:.2f} seconds)") st.pyplot(fig) # Clean up temp files os.remove(wav_path) os.remove(file_path)