# Source: Hugging Face Space export (file size 5,442 bytes, commit e61da93)
import librosa
import numpy as np
import torch
from collections import Counter
import nltk
import string
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# One-time download of the NLTK data used below: 'punkt'/'punkt_tab' for
# word tokenization, the perceptron tagger packages for POS tagging.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('averaged_perceptron_tagger')
# Run ASR inference on GPU in half precision when CUDA is available,
# otherwise fall back to CPU / float32.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
def get_pitch_list(y, sr, fps=30):
    """Return the per-frame dominant pitch (Hz) of an audio signal.

    Runs librosa.piptrack and, for each analysis frame, keeps the pitch of
    the frequency bin with the largest magnitude. Frames where piptrack
    detects no pitch come back as 0.0.

    Args:
        y: audio time series (1-D float array).
        sr: sampling rate in Hz.
        fps: analysis frames per second; the hop length is sr / fps
            (default 30, matching the previously hard-coded value).

    Returns:
        np.ndarray of shape (n_frames,), one pitch value per frame.
    """
    hop_length = int(sr / fps)  # frame spacing for ~fps frames per second
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr, hop_length=hop_length)
    # For every time frame, select the pitch at the strongest-magnitude bin
    # (vectorized form of the per-column argmax loop).
    strongest_bins = magnitudes.argmax(axis=0)
    pitch_frequencies = pitches[strongest_bins, np.arange(pitches.shape[1])]
    return pitch_frequencies
def _transcribe(y, sr, asrmodel, asrproc):
    """Transcribe audio `y` with the given ASR model/processor pair.

    Uses the module-level `device` / `torch_dtype` settings and returns the
    first decoded transcript string.
    """
    inputs = asrproc(y, sampling_rate=sr, return_tensors="pt").input_features
    inputs = inputs.to(device, dtype=torch_dtype)
    with torch.no_grad():  # inference only — no gradients needed
        generated_ids = asrmodel.generate(inputs)
    return asrproc.batch_decode(generated_ids, skip_special_tokens=True)[0]


def _pause_seconds(y, sr, top_db=40):
    """Return the total seconds of silence in `y`.

    librosa.effects.split yields the NON-silent intervals, so pause time is
    the total duration minus the voiced duration.  `top_db` is a positive
    "decibels below peak" threshold — the previous code passed -40, which
    classified essentially the entire signal as silent, and then summed the
    voiced intervals as if they were pauses.
    """
    voiced_intervals = librosa.effects.split(y, top_db=top_db)
    voiced_seconds = sum(end - start for start, end in voiced_intervals) / sr
    total_seconds = librosa.get_duration(y=y, sr=sr)
    return max(total_seconds - voiced_seconds, 0.0)


def _save_wordcloud(word_frequencies, path):
    """Render `word_frequencies` as a word cloud and save it to `path` as PNG."""
    cloud = WordCloud(width=800, height=400,
                      background_color='white').generate_from_frequencies(word_frequencies)
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.savefig(path, format='png')
    plt.close()


def extract_audio_features(audio_path, asrmodel, asrproc, sentipipe, duration, wordcloud_path):
    """Compute acoustic and linguistic features for one audio file.

    Args:
        audio_path: path to the audio file (loaded resampled to 16 kHz).
        asrmodel: speech-recognition model with a `.generate()` method.
        asrproc: matching processor (feature extraction + decoding).
        sentipipe: sentiment-analysis pipeline callable on a string.
        duration: unused — kept for backward compatibility with callers.
        wordcloud_path: PNG path where the transcript word cloud is saved.

    Returns:
        (features, pitches): `features` is a dict of scalar acoustic/text
        metrics plus the transcript and mapped sentiment; `pitches` is the
        per-frame pitch array from get_pitch_list.
    """
    y, sr = librosa.load(audio_path, sr=16000)
    transcript = _transcribe(y, sr, asrmodel, asrproc)

    # --- acoustic features ---
    sound_intensity = np.mean(librosa.feature.rms(y=y))  # mean RMS energy
    pitches = get_pitch_list(y, sr)
    # Fundamental frequency (F0); pyin marks unvoiced frames as NaN, hence nanmean.
    f0, voiced_flag, voiced_probs = librosa.pyin(
        y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
    fundamental_frequency = np.nanmean(f0)
    S = np.abs(librosa.stft(y))
    spectral_energy = np.mean(np.sum(S ** 2, axis=0))  # mean per-frame STFT energy
    avg_spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
    zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y))

    total_duration = librosa.get_duration(y=y, sr=sr)
    pause_duration = _pause_seconds(y, sr)
    # Seconds of silence per minute of audio.
    pause_rate = (pause_duration / total_duration) * 60

    # --- transcript statistics ---
    words = [w.lower() for w in nltk.word_tokenize(transcript)
             if w not in string.punctuation]
    num_words = len(words)
    unique_words = len(set(words))
    word_frequencies = Counter(words)
    duration_minutes = total_duration / 60
    avg_words_per_minute = num_words / duration_minutes
    avg_unique_words_per_minute = unique_words / duration_minutes

    # Hesitation markers. Multi-word phrases are counted in the lowercased
    # transcript directly (tokenization splits them, so a Counter lookup can
    # never match them); single words are lowercased before lookup — the old
    # code's 'I mean'/'I guess' entries could never match the lowercased tokens.
    filler_words = [
        'uh', 'um', 'like', 'you know', 'ah', 'er', 'hmm', 'well', 'so',
        'I mean', 'okay', 'right', 'actually', 'basically', 'you see',
        'sort of', 'kind of', 'yeah', 'literally', 'just', 'I guess',
        'totally', 'honestly', 'seriously', 'alright'
    ]
    transcript_lower = transcript.lower()
    filler_word_count = 0
    for filler in filler_words:
        phrase = filler.lower()
        if ' ' in phrase:
            filler_word_count += transcript_lower.count(phrase)
        else:
            filler_word_count += word_frequencies.get(phrase, 0)
    filler_words_per_minute = filler_word_count / duration_minutes

    # --- part-of-speech breakdown ---
    pos_tags = nltk.pos_tag(words)
    nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
    adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]
    verbs = [word for word, pos in pos_tags if pos.startswith('VB')]

    # --- sentiment ---
    sentiment = sentipipe(transcript)
    sentiment_mapping = {
        "LABEL_0": "Negative",
        "LABEL_1": "Neutral",
        "LABEL_2": "Positive"
    }
    # Fall back to the model's own label instead of raising KeyError when the
    # pipeline emits something other than LABEL_0/1/2.
    sentiment[0]['label'] = sentiment_mapping.get(sentiment[0]['label'],
                                                  sentiment[0]['label'])

    # Word cloud — skipped for an empty transcript, since WordCloud raises
    # ValueError on an empty frequency dict.
    if word_frequencies:
        _save_wordcloud(word_frequencies, wordcloud_path)

    print("Nouns: ", nouns)
    print("Adjectives: ", adjectives)
    print("Verbs: ", verbs)
    print("Sentiment: ", sentiment)
    return {
        "transcript": transcript,
        "sentiment": sentiment,
        "sound_intensity": float(sound_intensity),
        "fundamental_frequency": float(fundamental_frequency),
        "spectral_energy": float(spectral_energy),
        "spectral_centroid": float(avg_spectral_centroid),
        "zero_crossing_rate": float(zero_crossing_rate),
        "avg_words_per_minute": float(avg_words_per_minute),
        "avg_unique_words_per_minute": float(avg_unique_words_per_minute),
        "unique_word_count": int(unique_words),
        "filler_words_per_minute": float(filler_words_per_minute),
        "noun_count": len(nouns),
        "adjective_count": len(adjectives),
        "verb_count": len(verbs),
        "pause_rate": float(pause_rate)
    }, pitches