File size: 5,442 Bytes
e61da93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import librosa
import numpy as np
import torch
from collections import Counter
import nltk
import string
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# One-time NLTK resource downloads (no-ops if already cached locally).
# Both the "_eng" and legacy tagger names are fetched because the required
# resource name differs across NLTK versions — TODO confirm both are needed
# for the installed NLTK release.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('averaged_perceptron_tagger')
# Run the ASR model on GPU in half precision when available; otherwise CPU/fp32.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

def get_pitch_list(y, sr):
    """Return the dominant pitch (Hz) for each analysis frame of *y*.

    Frames are spaced at roughly 30 per second of audio. For each frame,
    the pitch of the spectral bin with the largest magnitude is taken;
    unvoiced/silent frames come back as 0.0 (piptrack's convention).

    Parameters
    ----------
    y : np.ndarray
        Audio time series.
    sr : int
        Sampling rate of ``y``.

    Returns
    -------
    np.ndarray
        1-D array of per-frame pitch estimates in Hz, one per frame.
    """
    hop_length = int(sr / 30)  # ~30 analysis frames per second

    # Extract pitch/magnitude candidates with librosa's piptrack method.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr, hop_length=hop_length)

    # Vectorized replacement of the per-frame Python loop: for every frame
    # take the pitch at the bin with the largest magnitude.
    best_bins = magnitudes.argmax(axis=0)
    pitch_frequencies = pitches[best_bins, np.arange(pitches.shape[1])]

    # (Debug `print` of the array shape removed.)
    return pitch_frequencies


def extract_audio_features(audio_path, asrmodel, asrproc, sentipipe, duration, wordcloud_path):
    """Transcribe an audio file and compute acoustic + linguistic features.

    Parameters
    ----------
    audio_path : str
        Path to the audio file (loaded and resampled to 16 kHz).
    asrmodel, asrproc :
        Whisper-style ASR model and its processor (``generate`` /
        ``batch_decode`` interface).
    sentipipe :
        Sentiment-analysis pipeline returning ``[{'label': ..., 'score': ...}]``.
    duration :
        Unused; kept for caller compatibility.
    wordcloud_path : str
        Destination PNG path for the generated word cloud.

    Returns
    -------
    (dict, np.ndarray)
        Feature dictionary and the per-frame pitch array from
        :func:`get_pitch_list`.
    """
    y, sr = librosa.load(audio_path, sr=16000)
    inputs = asrproc(y, sampling_rate=sr, return_tensors="pt").input_features
    inputs = inputs.to(device, dtype=torch_dtype)
    with torch.no_grad():
        generated_ids = asrmodel.generate(inputs)
        transcript = asrproc.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Sound intensity (RMS)
    rms = librosa.feature.rms(y=y)
    sound_intensity = np.mean(rms)

    # Pitch list
    pitches = get_pitch_list(y, sr)

    # Fundamental frequency (F0)
    f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
    fundamental_frequency = np.nanmean(f0)

    # Spectral energy (based on STFT)
    S = np.abs(librosa.stft(y))
    spectral_energy = np.mean(np.sum(S ** 2, axis=0))

    # Spectral centroid
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    avg_spectral_centroid = np.mean(spectral_centroid)

    # Zero-crossing rate
    zcr = librosa.feature.zero_crossing_rate(y)
    zero_crossing_rate = np.mean(zcr)

    # Pause detection.
    # BUGFIX: librosa.effects.split returns the NON-silent intervals, so the
    # original code summed speech time and reported it as pause time.  Pause
    # time is total duration minus voiced time.  Also, top_db is a positive
    # "dB below peak" threshold — the previous -40 was the wrong sign.
    top_db = 40
    voiced_intervals = librosa.effects.split(y, top_db=top_db)
    voiced_duration = sum((end - start) for start, end in voiced_intervals) / sr
    total_duration = librosa.get_duration(y=y, sr=sr)
    pause_duration = max(total_duration - voiced_duration, 0.0)
    # Fraction of time paused, scaled to seconds of pause per minute of audio.
    pause_rate = (pause_duration / total_duration) * 60 if total_duration > 0 else 0.0

    # Transcript processing
    words = nltk.word_tokenize(transcript)
    words = [word.lower() for word in words if word not in string.punctuation]
    num_words = len(words)
    unique_words = len(set(words))
    word_frequencies = Counter(words)

    # Duration in minutes (guard against zero-length audio).
    duration_minutes = total_duration / 60
    if duration_minutes > 0:
        avg_words_per_minute = num_words / duration_minutes
        avg_unique_words_per_minute = unique_words / duration_minutes
    else:
        avg_words_per_minute = 0.0
        avg_unique_words_per_minute = 0.0

    # Filler word detection.
    # BUGFIX: multi-word fillers ("you know", "I mean") can never appear in a
    # Counter of single tokens, and capitalized entries ("I mean") could never
    # match the lowercased word list.  Single-token fillers are counted via the
    # Counter; multi-word fillers are counted as phrases in the lowercased,
    # tokenized transcript.
    filler_words = [
        'uh', 'um', 'like', 'you know', 'ah', 'er', 'hmm', 'well', 'so',
        'I mean', 'okay', 'right', 'actually', 'basically', 'you see',
        'sort of', 'kind of', 'yeah', 'literally', 'just', 'I guess',
        'totally', 'honestly', 'seriously', 'alright'
    ]
    joined_words = " ".join(words)
    filler_word_count = 0
    for filler in filler_words:
        filler_lc = filler.lower()
        if " " in filler_lc:
            filler_word_count += joined_words.count(filler_lc)
        else:
            filler_word_count += word_frequencies.get(filler_lc, 0)
    filler_words_per_minute = filler_word_count / duration_minutes if duration_minutes > 0 else 0.0

    # POS tagging
    pos_tags = nltk.pos_tag(words)
    nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
    adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]
    verbs = [word for word, pos in pos_tags if pos.startswith('VB')]

    # Sentiment analysis.  Map opaque LABEL_n names to readable ones; fall
    # back to the pipeline's own label if it already returns readable names
    # (avoids a KeyError with differently-configured models).
    sentiment = sentipipe(transcript)
    sentiment_mapping = {
        "LABEL_0": "Negative",
        "LABEL_1": "Neutral",
        "LABEL_2": "Positive"
    }
    sentiment[0]['label'] = sentiment_mapping.get(sentiment[0]['label'], sentiment[0]['label'])

    # Generate the word cloud and save it as a PNG image.
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_frequencies)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.savefig(wordcloud_path, format='png')
    plt.close()

    print("Nouns: ", nouns)
    print("Adjectives: ", adjectives)
    print("Verbs: ", verbs)
    print("Sentiment: ", sentiment)

    return {
        "transcript": transcript,
        "sentiment": sentiment,
        "sound_intensity": float(sound_intensity),
        "fundamental_frequency": float(fundamental_frequency),
        "spectral_energy": float(spectral_energy),
        "spectral_centroid": float(avg_spectral_centroid),
        "zero_crossing_rate": float(zero_crossing_rate),
        "avg_words_per_minute": float(avg_words_per_minute),
        "avg_unique_words_per_minute": float(avg_unique_words_per_minute),
        "unique_word_count": int(unique_words),
        "filler_words_per_minute": float(filler_words_per_minute),
        "noun_count": len(nouns),
        "adjective_count": len(adjectives),
        "verb_count": len(verbs),
        "pause_rate": float(pause_rate)
    }, pitches