# NOTE: this file was recovered from a web scrape of a hosted demo page;
# the "Spaces / Sleeping" banner text above the code was page chrome, not source.
# Standard library
import string
from collections import Counter

# Third-party
import librosa
import matplotlib.pyplot as plt
import nltk
import numpy as np
import torch
from wordcloud import WordCloud

# Fetch the NLTK resources needed for tokenization and POS tagging.
# (Same resources, same order as before — just driven by one loop.)
for _resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'averaged_perceptron_tagger',
):
    nltk.download(_resource)

# Run on GPU with half precision when CUDA is available; otherwise CPU/float32.
_cuda_available = torch.cuda.is_available()
device = "cuda:0" if _cuda_available else "cpu"
torch_dtype = torch.float16 if _cuda_available else torch.float32
def get_pitch_list(y, sr, frames_per_second=30):
    """Return the dominant pitch (Hz) of each analysis frame of an audio signal.

    Parameters
    ----------
    y : np.ndarray
        Audio time series.
    sr : int
        Sample rate of ``y`` in Hz.
    frames_per_second : int, optional
        Analysis frame rate; the hop length is derived as ``sr // frames_per_second``.
        Defaults to 30, matching the original hard-coded value.

    Returns
    -------
    np.ndarray
        1-D array with one pitch value per frame. librosa's piptrack reports
        0.0 in frames where no pitch was detected.
    """
    # hop_length determines how far apart successive analysis frames are.
    hop_length = int(sr / frames_per_second)

    # Extract candidate pitches and their magnitudes via librosa's piptrack.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr, hop_length=hop_length)

    # For every frame, select the pitch of the strongest-magnitude bin.
    # Vectorized replacement for the original per-frame Python argmax loop.
    strongest_bins = magnitudes.argmax(axis=0)
    frame_indices = np.arange(pitches.shape[1])
    pitch_frequencies = pitches[strongest_bins, frame_indices]

    print("shape : ", pitch_frequencies.shape)
    return pitch_frequencies
| def extract_audio_features(audio_path, asrmodel, asrproc, sentipipe, duration, wordcloud_path): | |
| y, sr = librosa.load(audio_path, sr=16000) | |
| inputs = asrproc(y, sampling_rate=sr, return_tensors="pt").input_features | |
| inputs = inputs.to(device, dtype=torch_dtype) | |
| with torch.no_grad(): | |
| generated_ids = asrmodel.generate(inputs) | |
| transcript = asrproc.batch_decode(generated_ids, skip_special_tokens=True)[0] | |
| # Sound intensity (RMS) | |
| rms = librosa.feature.rms(y=y) | |
| sound_intensity = np.mean(rms) | |
| # Pitch list | |
| pitches=get_pitch_list(y,sr) | |
| # Fundamental frequency (F0) | |
| f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7')) | |
| fundamental_frequency = np.nanmean(f0) | |
| # Spectral energy (based on STFT) | |
| S = np.abs(librosa.stft(y)) | |
| spectral_energy = np.mean(np.sum(S ** 2, axis=0)) | |
| # Spectral centroid | |
| spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr) | |
| avg_spectral_centroid = np.mean(spectral_centroid) | |
| # Zero-crossing rate | |
| zcr = librosa.feature.zero_crossing_rate(y) | |
| zero_crossing_rate = np.mean(zcr) | |
| # Pause detection | |
| silence_threshold = -40 | |
| silent_intervals = librosa.effects.split(y, top_db=silence_threshold) | |
| pause_duration = 0 | |
| for start, end in silent_intervals: | |
| pause_duration += (end - start) / sr | |
| total_duration = librosa.get_duration(y=y, sr=sr) | |
| pause_rate = (pause_duration / total_duration) * 60 # Convert to pauses per minute | |
| # Transcript processing | |
| words = nltk.word_tokenize(transcript) | |
| words = [word.lower() for word in words if word not in string.punctuation] | |
| num_words = len(words) | |
| unique_words = len(set(words)) | |
| word_frequencies = Counter(words) | |
| # Duration in minutes | |
| duration_minutes = total_duration / 60 | |
| avg_words_per_minute = num_words / duration_minutes | |
| avg_unique_words_per_minute = unique_words / duration_minutes | |
| # Filler word detection | |
| filler_words = [ | |
| 'uh', 'um', 'like', 'you know', 'ah', 'er', 'hmm', 'well', 'so', | |
| 'I mean', 'okay', 'right', 'actually', 'basically', 'you see', | |
| 'sort of', 'kind of', 'yeah', 'literally', 'just', 'I guess', | |
| 'totally', 'honestly', 'seriously', 'alright' | |
| ] | |
| filler_word_count = sum([word_frequencies.get(filler, 0) for filler in filler_words]) | |
| filler_words_per_minute = filler_word_count / duration_minutes | |
| # POS tagging | |
| pos_tags = nltk.pos_tag(words) | |
| nouns = [word for word, pos in pos_tags if pos.startswith('NN')] | |
| adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')] | |
| verbs = [word for word, pos in pos_tags if pos.startswith('VB')] | |
| # Sentiment analysis | |
| sentiment = sentipipe(transcript) | |
| sentiment_mapping = { | |
| "LABEL_0": "Negative", | |
| "LABEL_1": "Neutral", | |
| "LABEL_2": "Positive" | |
| } | |
| sentiment[0]['label'] = sentiment_mapping[sentiment[0]['label']] | |
| # Generate Word Cloud and Save it as an Image | |
| wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_frequencies) | |
| # Save the Word Cloud to the provided path | |
| plt.figure(figsize=(10, 5)) | |
| plt.imshow(wordcloud, interpolation='bilinear') | |
| plt.axis('off') | |
| plt.savefig(wordcloud_path, format='png') | |
| plt.close() | |
| print("Nouns: ", nouns) | |
| print("Adjectives: ", adjectives) | |
| print("Verbs: ", verbs) | |
| print("Sentiment: ", sentiment) | |
| return { | |
| "transcript": transcript, | |
| "sentiment": sentiment, | |
| "sound_intensity": float(sound_intensity), | |
| "fundamental_frequency": float(fundamental_frequency), | |
| "spectral_energy": float(spectral_energy), | |
| "spectral_centroid": float(avg_spectral_centroid), | |
| "zero_crossing_rate": float(zero_crossing_rate), | |
| "avg_words_per_minute": float(avg_words_per_minute), | |
| "avg_unique_words_per_minute": float(avg_unique_words_per_minute), | |
| "unique_word_count": int(unique_words), | |
| "filler_words_per_minute": float(filler_words_per_minute), | |
| "noun_count": len(nouns), | |
| "adjective_count": len(adjectives), | |
| "verb_count": len(verbs), | |
| "pause_rate": float(pause_rate) | |
| },pitches | |