import speech_recognition as sr
import librosa
import os
import nltk
import matplotlib.pyplot as plt
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
import tensorflow
from tensorflow.keras.models import load_model # type: ignore
import numpy as np
import pandas as pd
import soundfile as sf
import statistics
from pyAudioAnalysis import audioSegmentation as aS
import nltk  # NOTE(review): duplicate of the nltk import above; harmless but redundant.
# One-time download of the tokenizer and POS-tagger models required by
# pos_tag_and_filter(). Runs at import time (network side effect).
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
# Maps the integer class index produced by the emotion model (argmax over
# its softmax output in predict_emotions) to a human-readable emotion name.
label_mapping = {
    0: 'angry',
    1: 'disgust',
    2: 'fear',
    3: 'happy',
    4: 'neutral',
    5: 'sad',
    6: 'surprise',
}
def features_extractor(file_name):
    """Extract a single fixed-length feature vector from an audio file.

    Loads the audio, computes MFCC (40 coefficients), chroma (12 pitch
    classes) and mel-spectrogram features, averages each over time, and
    concatenates them into one 1-D numpy array.
    """
    signal, rate = librosa.load(file_name, res_type='kaiser_best')
    # Time-averaged MFCCs (40 coefficients).
    mfcc_vec = np.mean(librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=40).T, axis=0)
    # Time-averaged chroma vector (12 pitch classes).
    chroma_vec = np.mean(librosa.feature.chroma_stft(y=signal, sr=rate, n_chroma=12).T, axis=0)
    # Time-averaged mel spectrogram.
    mel_vec = np.mean(librosa.feature.melspectrogram(y=signal, sr=rate).T, axis=0)
    # Single concatenated feature vector for the classifier.
    return np.hstack((mfcc_vec, chroma_vec, mel_vec))
def predict_emotions(audio_path, interval, model_s):
    """Predict an emotion label for each `interval`-second slice of the audio.

    Args:
        audio_path: Path to the input audio file.
        interval: Segment length in seconds.
        model_s: Keras model whose predict() returns class probabilities;
            argmax is mapped through `label_mapping`.

    Returns:
        List of emotion label strings, one per segment.
    """
    import tempfile

    audio_data, samplerate = sf.read(audio_path)
    duration = len(audio_data) / samplerate
    emotions = []
    # Fix: the original wrote a fixed 'segment.wav' into the working
    # directory and never deleted it. Use one temporary file, reused for
    # every segment, and remove it when done.
    tmp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
    segment_path = tmp.name
    tmp.close()
    try:
        for start in np.arange(0, duration, interval):
            # Clamp the final segment to the end of the audio.
            end = min(start + interval, duration)
            segment = audio_data[int(start * samplerate):int(end * samplerate)]
            sf.write(segment_path, segment, samplerate)
            feat = features_extractor(segment_path)
            if feat is not None:
                feat = feat.reshape(1, -1)
                predictions = np.argmax(model_s.predict(feat), axis=1)
                emotions.append(label_mapping[predictions[0]])
    finally:
        # Best-effort cleanup of the scratch file.
        try:
            os.remove(segment_path)
        except OSError:
            pass
    return emotions
def recognize_speech_from_file(audio_file_path):
    """Transcribe an audio file with Google's Web Speech API.

    Returns the transcript string, or None when the speech is
    unintelligible or the API request fails (the latter is also printed).
    """
    recognizer = sr.Recognizer()
    # Read the whole file into an AudioData object.
    with sr.AudioFile(audio_file_path) as source:
        recorded = recognizer.record(source)
    try:
        return recognizer.recognize_google(recorded)
    except sr.UnknownValueError:
        # Speech could not be understood.
        return None
    except sr.RequestError as e:
        # Network / API failure — report and give up.
        print(f"Could not request results from Google Speech Recognition service; {e}")
        return None
def count_words(text):
    """Return the number of whitespace-separated words in *text*."""
    return len(text.split())
def estimate_syllables(text):
    """Estimate the number of syllables in *text*.

    Heuristic: each maximal run of consecutive vowels in a word counts as
    one syllable. This fixes the original's overcounting of vowel digraphs
    and diphthongs (it counted every vowel, so "speech" scored 2).
    Consonant-only words still contribute 0 — acceptable for a rough rate
    estimate.
    """
    vowels = set('aeiou')
    syllable_count = 0
    for word in text.split():
        previous_was_vowel = False
        for ch in word:
            is_vowel = ch.lower() in vowels
            # Count only the start of each vowel run.
            if is_vowel and not previous_was_vowel:
                syllable_count += 1
            previous_was_vowel = is_vowel
    return syllable_count
def get_speaking_rate(file_path, transcript):
    """Return the speaking rate as estimated syllables per second.

    Uses the full audio duration (including pauses) as the denominator;
    returns 0 for zero-length audio.
    """
    # Note: use a distinct local name to avoid shadowing the module-level
    # `sr` alias for speech_recognition.
    samples, sample_rate = librosa.load(file_path, sr=None)
    total_duration = len(samples) / sample_rate
    num_syllables = estimate_syllables(transcript)
    if total_duration <= 0:
        return 0
    return num_syllables / total_duration
def calculate_pause_metrics(file_path):
    """Return the average pause length (seconds) of an audio file.

    Speech regions are found with pyAudioAnalysis silence removal; total
    pause time is the audio duration minus the summed speech time, divided
    by the number of gaps between speech segments. Returns 0 when there
    are no gaps.
    """
    samples, sample_rate = librosa.load(file_path, sr=None)
    # (start, end) pairs of detected speech, in seconds.
    segments = aS.silence_removal(samples, sample_rate, 0.020, 0.020, smooth_window=1.0, weight=0.3, plot=False)
    total_duration = len(samples) / sample_rate
    speech_duration = sum(end - start for start, end in segments)
    pause_duration = total_duration - speech_duration
    # N speech segments imply at most N-1 internal pauses.
    num_pauses = max(len(segments) - 1, 0)
    if num_pauses == 0:
        return 0
    return pause_duration / num_pauses
def calculate_articulation_rate(file_path, transcript):
    """Return syllables per second of actual speech (pauses excluded).

    Unlike get_speaking_rate, the denominator is only the time spent
    speaking, as detected by pyAudioAnalysis silence removal. Returns 0
    when no speech is detected.
    """
    samples, sample_rate = librosa.load(file_path, sr=None)
    segments = aS.silence_removal(samples, sample_rate, 0.020, 0.020, smooth_window=1.0, weight=0.3, plot=False)
    speech_duration = sum(end - start for start, end in segments)
    num_syllables = estimate_syllables(transcript)
    if speech_duration <= 0:
        return 0
    return num_syllables / speech_duration
def pos_tag_and_filter(transcript):
    """Return (word, tag) pairs for content words in *transcript*.

    Tokenizes and POS-tags the text with NLTK, keeping only nouns, verbs,
    adjectives and adverbs (the word classes that carry affective meaning).
    """
    # Penn Treebank tags for nouns, verbs, adjectives and adverbs.
    important_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS'}
    tokens = nltk.word_tokenize(transcript)
    return [(word, tag) for word, tag in nltk.pos_tag(tokens) if tag in important_tags]
def load_values(file_path):
    """Load a word -> float rating lookup from a tab-separated file.

    Each non-empty line must be "word<TAB>value". Keys are lower-cased.
    Fixes: reads with an explicit utf-8 encoding, and skips blank lines
    (e.g. a trailing newline), which previously raised ValueError.
    """
    values_dict = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # tolerate blank / trailing lines
            word, value = line.split('\t')
            values_dict[word.lower()] = float(value)
    return values_dict
# Map values to filtered words
def map_values_to_filtered_words(filtered_words, valence_dict, arousal_dict, dominance_dict):
    """Attach VAD (valence/arousal/dominance) ratings to each word.

    Returns a pair:
      - mapped_values: list of (word, valence, arousal, dominance, found)
        tuples, where found is 1 when all three ratings exist and 0
        otherwise (missing ratings appear as the string 'not found');
      - word_weights: dict of word -> valence + arousal + dominance
        (0 for words missing any rating).
    Lookups are case-insensitive; valence and arousal are rescaled from
    [-1, 1] to [0, 1], dominance is kept as-is.
    """
    mapped_values = []
    word_weights = {}
    for word in filtered_words:
        key = word.lower()
        valence = valence_dict.get(key)
        arousal = arousal_dict.get(key)
        dominance = dominance_dict.get(key)
        if valence is None or arousal is None or dominance is None:
            # Word missing from at least one lexicon: mark it unfound.
            mapped_values.append((word, 'not found', 'not found', 'not found', 0))
            word_weights[word] = 0
            continue
        # Rescale valence/arousal into [0, 1]; dominance stays raw.
        valence = (valence + 1) / 2
        arousal = (arousal + 1) / 2
        mapped_values.append((word, valence, arousal, dominance, 1))
        word_weights[word] = valence + arousal + dominance
    return mapped_values, word_weights
def generate_word_cloud(word_weights):
    """Return *word_weights* unchanged when non-empty, otherwise None."""
    return word_weights if word_weights else None
def analyze_audio(file_path, valence_dict, arousal_dict, dominance_dict):
    """Run the full text/prosody analysis pipeline on one audio file.

    Transcribes the audio (falling back to a fixed sample sentence when
    recognition fails), extracts content words and their VAD ratings, and
    computes word count, speaking rate, average pause length and
    articulation rate. Returns everything in a single dict.
    """
    transcript = recognize_speech_from_file(file_path)
    print(transcript)
    if not transcript:
        # Fallback sample text when recognition yields nothing.
        transcript = "I want you to act like he's coming back, both of you. Don't think I haven't noticed you since he in..."
    filtered_words_with_tags = pos_tag_and_filter(transcript)
    filtered_words = [w for w, _tag in filtered_words_with_tags]
    mapped_values, word_weights = map_values_to_filtered_words(filtered_words, valence_dict, arousal_dict, dominance_dict)
    word_weights = generate_word_cloud(word_weights)
    return {
        'word_count': count_words(transcript),
        'word_weights': word_weights,
        'speaking_rate': get_speaking_rate(file_path, transcript),
        'average_pause_length': calculate_pause_metrics(file_path),
        'articulation_rate': calculate_articulation_rate(file_path, transcript),
        'mapped_values': mapped_values,
    }
def speech_predict(audio_path, model_s, valence_dict, arousal_dict, dominance_dict):
    """Top-level entry point: per-segment emotions plus text analysis.

    Returns a tuple (emotions, major_emotion, word) where emotions is one
    label per 3-second segment, major_emotion is the modal label, and word
    is the metrics dict from analyze_audio.
    """
    segment_interval = 3.0  # seconds per emotion-detection segment
    emotions = predict_emotions(audio_path, segment_interval, model_s)
    # The most frequent per-segment label is taken as the overall emotion.
    major_emotion = statistics.mode(emotions)
    word = analyze_audio(audio_path, valence_dict, arousal_dict, dominance_dict)
    return emotions, major_emotion, word