Rahulk2197 committed on
Commit
92724d0
·
verified ·
1 Parent(s): e272f8f

Update functions/audio.py

Browse files
Files changed (1) hide show
  1. functions/audio.py +98 -96
functions/audio.py CHANGED
@@ -1,97 +1,99 @@
1
- import librosa
2
- import numpy as np
3
- import torch
4
- from collections import Counter
5
- import nltk
6
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
7
- torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
8
- def extract_audio_features(audio_path,asrmodel,asrproc,sentipipe):
9
- y, sr = librosa.load(audio_path,sr=16000)
10
- inputs = asrproc(y, sampling_rate=sr, return_tensors="pt").input_features
11
- inputs = inputs.to(device, dtype=torch_dtype)
12
- with torch.no_grad():
13
- generated_ids = asrmodel.generate(inputs)
14
- transcript = asrproc.batch_decode(generated_ids, skip_special_tokens=True)[0]
15
- # Sound intensity (RMS)
16
- rms = librosa.feature.rms(y=y)
17
- sound_intensity = np.mean(rms)
18
-
19
- # Fundamental frequency (F0)
20
- f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
21
- fundamental_frequency = np.nanmean(f0)
22
-
23
- # Spectral energy (based on STFT)
24
- S = np.abs(librosa.stft(y))
25
- spectral_energy = np.mean(np.sum(S ** 2, axis=0))
26
-
27
- # Spectral centroid
28
- spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
29
- avg_spectral_centroid = np.mean(spectral_centroid)
30
-
31
- # Zero-crossing rate
32
- zcr = librosa.feature.zero_crossing_rate(y)
33
- zero_crossing_rate = np.mean(zcr)
34
-
35
- # Pause detection
36
- silence_threshold = -40
37
- silent_intervals = librosa.effects.split(y, top_db=silence_threshold) # Split into non-silent intervals
38
- pause_duration = 0
39
- for start, end in silent_intervals:
40
- pause_duration += (end - start) / sr # Add the pause duration in seconds
41
-
42
- total_duration = librosa.get_duration(y=y, sr=sr)
43
- pause_rate = (pause_duration / total_duration) * 60 # Convert to pauses per minute
44
-
45
- # Transcript processing
46
- words = nltk.word_tokenize(transcript)
47
- num_words = len(words)
48
- unique_words = len(set(words))
49
- word_frequencies = Counter(words)
50
-
51
- duration_minutes = total_duration / 60
52
- avg_words_per_minute = num_words / duration_minutes
53
- avg_unique_words_per_minute = unique_words / duration_minutes
54
-
55
- # Count of unique words
56
- unique_word_count = unique_words
57
-
58
- # Filler word detection
59
- filler_words = [
60
- 'uh', 'um', 'like', 'you know', 'ah', 'er', 'hmm', 'well', 'so',
61
- 'I mean', 'okay', 'right', 'actually', 'basically', 'you see',
62
- 'sort of', 'kind of', 'yeah', 'literally', 'just', 'I guess',
63
- 'totally', 'honestly', 'seriously', 'alright'
64
- ]
65
- filler_word_count = sum([word_frequencies.get(filler, 0) for filler in filler_words])
66
- filler_words_per_minute = filler_word_count / duration_minutes
67
-
68
- # POS tagging
69
- pos_tags = nltk.pos_tag(words)
70
- nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
71
- adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]
72
- verbs = [word for word, pos in pos_tags if pos.startswith('VB')]
73
-
74
- # Sentiment analysis
75
- sentiment = sentipipe(transcript)
76
-
77
- print("Nouns: ", nouns)
78
- print("Adjectives: ", adjectives)
79
- print("Verbs: ", verbs)
80
-
81
- return {
82
- "transcript": transcript, # assuming this is a string
83
- "sentiment":sentiment,
84
- "sound_intensity": float(sound_intensity), # convert numpy float to Python float
85
- "fundamental_frequency": float(fundamental_frequency), # same conversion
86
- "spectral_energy": float(spectral_energy), # convert to Python float
87
- "spectral_centroid": float(avg_spectral_centroid), # convert numpy float
88
- "zero_crossing_rate": float(zero_crossing_rate), # convert to Python float
89
- "avg_words_per_minute": float(avg_words_per_minute), # same conversion
90
- "avg_unique_words_per_minute": float(avg_unique_words_per_minute), # convert float
91
- "unique_word_count": int(unique_word_count), # convert to integer if needed
92
- "filler_words_per_minute": float(filler_words_per_minute), # convert float
93
- "noun_count": len(nouns), # Assuming nouns is a list, so no changes needed
94
- "adjective_count": len(adjectives), # Same here
95
- "verb_count": len(verbs), # Same here
96
- "pause_rate": float(pause_rate), # convert to Python float
 
 
97
  }
 
1
+ import librosa
2
+ import numpy as np
3
+ import torch
4
+ from collections import Counter
5
+ import nltk
6
+ nltk.download('punkt_tab')
7
+ nltk.download('averaged_perceptron_tagger_eng')
8
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
9
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
10
+ def extract_audio_features(audio_path,asrmodel,asrproc,sentipipe):
11
+ y, sr = librosa.load(audio_path,sr=16000)
12
+ inputs = asrproc(y, sampling_rate=sr, return_tensors="pt").input_features
13
+ inputs = inputs.to(device, dtype=torch_dtype)
14
+ with torch.no_grad():
15
+ generated_ids = asrmodel.generate(inputs)
16
+ transcript = asrproc.batch_decode(generated_ids, skip_special_tokens=True)[0]
17
+ # Sound intensity (RMS)
18
+ rms = librosa.feature.rms(y=y)
19
+ sound_intensity = np.mean(rms)
20
+
21
+ # Fundamental frequency (F0)
22
+ f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
23
+ fundamental_frequency = np.nanmean(f0)
24
+
25
+ # Spectral energy (based on STFT)
26
+ S = np.abs(librosa.stft(y))
27
+ spectral_energy = np.mean(np.sum(S ** 2, axis=0))
28
+
29
+ # Spectral centroid
30
+ spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
31
+ avg_spectral_centroid = np.mean(spectral_centroid)
32
+
33
+ # Zero-crossing rate
34
+ zcr = librosa.feature.zero_crossing_rate(y)
35
+ zero_crossing_rate = np.mean(zcr)
36
+
37
+ # Pause detection
38
+ silence_threshold = -40
39
+ silent_intervals = librosa.effects.split(y, top_db=silence_threshold) # Split into non-silent intervals
40
+ pause_duration = 0
41
+ for start, end in silent_intervals:
42
+ pause_duration += (end - start) / sr # Add the pause duration in seconds
43
+
44
+ total_duration = librosa.get_duration(y=y, sr=sr)
45
+ pause_rate = (pause_duration / total_duration) * 60 # Convert to pauses per minute
46
+
47
+ # Transcript processing
48
+ words = nltk.word_tokenize(transcript)
49
+ num_words = len(words)
50
+ unique_words = len(set(words))
51
+ word_frequencies = Counter(words)
52
+
53
+ duration_minutes = total_duration / 60
54
+ avg_words_per_minute = num_words / duration_minutes
55
+ avg_unique_words_per_minute = unique_words / duration_minutes
56
+
57
+ # Count of unique words
58
+ unique_word_count = unique_words
59
+
60
+ # Filler word detection
61
+ filler_words = [
62
+ 'uh', 'um', 'like', 'you know', 'ah', 'er', 'hmm', 'well', 'so',
63
+ 'I mean', 'okay', 'right', 'actually', 'basically', 'you see',
64
+ 'sort of', 'kind of', 'yeah', 'literally', 'just', 'I guess',
65
+ 'totally', 'honestly', 'seriously', 'alright'
66
+ ]
67
+ filler_word_count = sum([word_frequencies.get(filler, 0) for filler in filler_words])
68
+ filler_words_per_minute = filler_word_count / duration_minutes
69
+
70
+ # POS tagging
71
+ pos_tags = nltk.pos_tag(words)
72
+ nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
73
+ adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]
74
+ verbs = [word for word, pos in pos_tags if pos.startswith('VB')]
75
+
76
+ # Sentiment analysis
77
+ sentiment = sentipipe(transcript)
78
+
79
+ print("Nouns: ", nouns)
80
+ print("Adjectives: ", adjectives)
81
+ print("Verbs: ", verbs)
82
+
83
+ return {
84
+ "transcript": transcript, # assuming this is a string
85
+ "sentiment":sentiment,
86
+ "sound_intensity": float(sound_intensity), # convert numpy float to Python float
87
+ "fundamental_frequency": float(fundamental_frequency), # same conversion
88
+ "spectral_energy": float(spectral_energy), # convert to Python float
89
+ "spectral_centroid": float(avg_spectral_centroid), # convert numpy float
90
+ "zero_crossing_rate": float(zero_crossing_rate), # convert to Python float
91
+ "avg_words_per_minute": float(avg_words_per_minute), # same conversion
92
+ "avg_unique_words_per_minute": float(avg_unique_words_per_minute), # convert float
93
+ "unique_word_count": int(unique_word_count), # convert to integer if needed
94
+ "filler_words_per_minute": float(filler_words_per_minute), # convert float
95
+ "noun_count": len(nouns), # Assuming nouns is a list, so no changes needed
96
+ "adjective_count": len(adjectives), # Same here
97
+ "verb_count": len(verbs), # Same here
98
+ "pause_rate": float(pause_rate), # convert to Python float
99
  }