Spaces:

Rahulk2197
/

Interview

Sleeping

App Files Files Community

Rahulk2197 committed on Oct 12, 2024

Commit

92724d0

verified ·

1 Parent(s): e272f8f

Update functions/audio.py

Browse files

Files changed (1) hide show

functions/audio.py +98 -96

functions/audio.py CHANGED Viewed

@@ -1,97 +1,99 @@
-import librosa
-import numpy as np
-import torch
-from collections import Counter
-import nltk
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-def extract_audio_features(audio_path,asrmodel,asrproc,sentipipe):
-    y, sr = librosa.load(audio_path,sr=16000)
-    inputs = asrproc(y, sampling_rate=sr, return_tensors="pt").input_features
-    inputs = inputs.to(device, dtype=torch_dtype)
-    with torch.no_grad():
-        generated_ids = asrmodel.generate(inputs)
-        transcript = asrproc.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    # Sound intensity (RMS)
-    rms = librosa.feature.rms(y=y)
-    sound_intensity = np.mean(rms)
-    # Fundamental frequency (F0)
-    f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
-    fundamental_frequency = np.nanmean(f0)
-    # Spectral energy (based on STFT)
-    S = np.abs(librosa.stft(y))
-    spectral_energy = np.mean(np.sum(S ** 2, axis=0))
-    # Spectral centroid
-    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
-    avg_spectral_centroid = np.mean(spectral_centroid)
-    # Zero-crossing rate
-    zcr = librosa.feature.zero_crossing_rate(y)
-    zero_crossing_rate = np.mean(zcr)
-    # Pause detection
-    silence_threshold = -40
-    silent_intervals = librosa.effects.split(y, top_db=silence_threshold)  # Split into non-silent intervals
-    pause_duration = 0
-    for start, end in silent_intervals:
-        pause_duration += (end - start) / sr  # Add the pause duration in seconds
-    total_duration = librosa.get_duration(y=y, sr=sr)
-    pause_rate = (pause_duration / total_duration) * 60  # Convert to pauses per minute
-    # Transcript processing
-    words = nltk.word_tokenize(transcript)
-    num_words = len(words)
-    unique_words = len(set(words))
-    word_frequencies = Counter(words)
-    duration_minutes = total_duration / 60
-    avg_words_per_minute = num_words / duration_minutes
-    avg_unique_words_per_minute = unique_words / duration_minutes
-    # Count of unique words
-    unique_word_count = unique_words
-    # Filler word detection
-    filler_words = [
-        'uh', 'um', 'like', 'you know', 'ah', 'er', 'hmm', 'well', 'so',
-        'I mean', 'okay', 'right', 'actually', 'basically', 'you see',
-        'sort of', 'kind of', 'yeah', 'literally', 'just', 'I guess',
-        'totally', 'honestly', 'seriously', 'alright'
-    ]
-    filler_word_count = sum([word_frequencies.get(filler, 0) for filler in filler_words])
-    filler_words_per_minute = filler_word_count / duration_minutes
-    # POS tagging
-    pos_tags = nltk.pos_tag(words)
-    nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
-    adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]
-    verbs = [word for word, pos in pos_tags if pos.startswith('VB')]
-    # Sentiment analysis
-    sentiment = sentipipe(transcript)
-    print("Nouns: ", nouns)
-    print("Adjectives: ", adjectives)
-    print("Verbs: ", verbs)
-    return {
-    "transcript": transcript,  # assuming this is a string
-    "sentiment":sentiment,
-    "sound_intensity": float(sound_intensity),  # convert numpy float to Python float
-    "fundamental_frequency": float(fundamental_frequency),  # same conversion
-    "spectral_energy": float(spectral_energy),  # convert to Python float
-    "spectral_centroid": float(avg_spectral_centroid),  # convert numpy float
-    "zero_crossing_rate": float(zero_crossing_rate),  # convert to Python float
-    "avg_words_per_minute": float(avg_words_per_minute),  # same conversion
-    "avg_unique_words_per_minute": float(avg_unique_words_per_minute),  # convert float
-    "unique_word_count": int(unique_word_count),  # convert to integer if needed
-    "filler_words_per_minute": float(filler_words_per_minute),  # convert float
-    "noun_count": len(nouns),  # Assuming nouns is a list, so no changes needed
-    "adjective_count": len(adjectives),  # Same here
-    "verb_count": len(verbs),  # Same here
-    "pause_rate": float(pause_rate), # convert to Python float
 }

+import librosa
+import numpy as np
+import torch
+from collections import Counter
+import nltk
+nltk.download('punkt_tab')  # runs at import time; presumably the tokenizer data nltk.word_tokenize needs - confirm against the installed NLTK version
+nltk.download('averaged_perceptron_tagger_eng')  # runs at import time; presumably the tagger data nltk.pos_tag needs - confirm
+device = "cuda:0" if torch.cuda.is_available() else "cpu"  # first CUDA device when available, otherwise CPU
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32  # half precision on GPU, full precision on CPU
+def _count_filler_phrases(tokens, phrases):
+    """Count case-insensitive occurrences of filler phrases in a token list.
+
+    Multi-word phrases such as "you know" are matched against runs of
+    consecutive tokens; a per-token frequency table can never contain a key
+    with a space in it, so looking them up there always yields zero.
+    """
+    lowered = [token.lower() for token in tokens]
+    total = 0
+    for phrase in phrases:
+        parts = phrase.lower().split()
+        width = len(parts)
+        total += sum(
+            lowered[i:i + width] == parts
+            for i in range(len(lowered) - width + 1)
+        )
+    return total
+
+
+def extract_audio_features(audio_path,asrmodel,asrproc,sentipipe):
+    """Transcribe an audio file and derive acoustic and lexical features.
+
+    Args:
+        audio_path: Path passed to ``librosa.load``; audio is resampled to 16 kHz.
+        asrmodel: Speech-recognition model exposing ``generate``
+            (presumably a Whisper-style model - confirm with the caller).
+        asrproc: Processor paired with ``asrmodel``; called to build
+            ``input_features`` and used for ``batch_decode``.
+        sentipipe: Callable applied to the transcript; its result is returned
+            unchanged under ``"sentiment"``.
+
+    Returns:
+        dict with the transcript, the sentiment result, and plain Python
+        numbers for: mean RMS intensity, mean fundamental frequency (0.0 when
+        no frame is voiced), mean per-frame spectral energy, mean spectral
+        centroid, mean zero-crossing rate, words and unique words per minute,
+        unique word count, filler words per minute, noun/adjective/verb
+        counts, and ``pause_rate`` (seconds of silence per minute of audio).
+    """
+    y, sr = librosa.load(audio_path, sr=16000)
+    total_duration = librosa.get_duration(y=y, sr=sr)
+    duration_minutes = total_duration / 60
+
+    def per_minute(count):
+        # Guard against zero-length audio instead of raising ZeroDivisionError.
+        return count / duration_minutes if duration_minutes > 0 else 0.0
+
+    # Speech-to-text
+    inputs = asrproc(y, sampling_rate=sr, return_tensors="pt").input_features
+    inputs = inputs.to(device, dtype=torch_dtype)
+    with torch.no_grad():
+        generated_ids = asrmodel.generate(inputs)
+        transcript = asrproc.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+    # Sound intensity (RMS)
+    sound_intensity = np.mean(librosa.feature.rms(y=y))
+
+    # Fundamental frequency (F0); pyin marks unvoiced frames as NaN.
+    f0, _, _ = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
+    # nanmean over an all-NaN array warns and returns NaN; report 0.0 instead.
+    fundamental_frequency = np.nanmean(f0) if np.any(np.isfinite(f0)) else 0.0
+
+    # Spectral energy (based on STFT)
+    magnitude = np.abs(librosa.stft(y))
+    spectral_energy = np.mean(np.sum(magnitude ** 2, axis=0))
+
+    # Spectral centroid
+    avg_spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
+
+    # Zero-crossing rate
+    zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y))
+
+    # Pause detection.
+    # top_db is a positive distance below the peak: the previous value of -40
+    # made every frame count as silent, so no interval was ever returned.
+    # split() returns the NON-silent intervals, so pause time is the remainder.
+    silence_top_db = 40
+    nonsilent_intervals = librosa.effects.split(y, top_db=silence_top_db)
+    nonsilent_samples = np.sum(nonsilent_intervals[:, 1] - nonsilent_intervals[:, 0])
+    pause_duration = max(total_duration - float(nonsilent_samples) / sr, 0.0)
+    # Seconds of silence per minute of audio.
+    pause_rate = (pause_duration / total_duration) * 60 if total_duration > 0 else 0.0
+
+    # Transcript processing
+    # NOTE(review): word_tokenize appears to emit punctuation as separate
+    # tokens, so punctuation is included in these counts - confirm if intended.
+    words = nltk.word_tokenize(transcript)
+    num_words = len(words)
+    unique_words = len(set(words))
+    avg_words_per_minute = per_minute(num_words)
+    avg_unique_words_per_minute = per_minute(unique_words)
+
+    # Filler word detection
+    filler_words = [
+        'uh', 'um', 'like', 'you know', 'ah', 'er', 'hmm', 'well', 'so',
+        'I mean', 'okay', 'right', 'actually', 'basically', 'you see',
+        'sort of', 'kind of', 'yeah', 'literally', 'just', 'I guess',
+        'totally', 'honestly', 'seriously', 'alright'
+    ]
+    filler_word_count = _count_filler_phrases(words, filler_words)
+    filler_words_per_minute = per_minute(filler_word_count)
+
+    # POS tagging (Penn Treebank tag prefixes)
+    pos_tags = nltk.pos_tag(words)
+    nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
+    adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]
+    verbs = [word for word, pos in pos_tags if pos.startswith('VB')]
+
+    # Sentiment analysis
+    sentiment = sentipipe(transcript)
+    print("Nouns: ", nouns)
+    print("Adjectives: ", adjectives)
+    print("Verbs: ", verbs)
+    # NumPy scalars are converted to built-in numbers before returning.
+    return {
+    "transcript": transcript,
+    "sentiment":sentiment,
+    "sound_intensity": float(sound_intensity),
+    "fundamental_frequency": float(fundamental_frequency),
+    "spectral_energy": float(spectral_energy),
+    "spectral_centroid": float(avg_spectral_centroid),
+    "zero_crossing_rate": float(zero_crossing_rate),
+    "avg_words_per_minute": float(avg_words_per_minute),
+    "avg_unique_words_per_minute": float(avg_unique_words_per_minute),
+    "unique_word_count": int(unique_words),
+    "filler_words_per_minute": float(filler_words_per_minute),
+    "noun_count": len(nouns),
+    "adjective_count": len(adjectives),
+    "verb_count": len(verbs),
+    "pause_rate": float(pause_rate),
 }