# NOTE: this file was recovered from a web scrape of a hosted demo page;
# the "Spaces / Sleeping" banner text above the code was page chrome, not source.
# Standard library
import string
from collections import Counter

# Third-party
import librosa
import matplotlib.pyplot as plt
import nltk
import numpy as np
import torch
from wordcloud import WordCloud

# Fetch the NLTK resources needed for tokenization and POS tagging.
# (Same resources, same order as before — just driven by one loop.)
for _resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'averaged_perceptron_tagger',
):
    nltk.download(_resource)

# Run on GPU with half precision when CUDA is available; otherwise CPU/float32.
_cuda_available = torch.cuda.is_available()
device = "cuda:0" if _cuda_available else "cpu"
torch_dtype = torch.float16 if _cuda_available else torch.float32
def get_pitch_list(y, sr, frames_per_second=30):
    """Return the dominant pitch (Hz) of each analysis frame of an audio signal.

    Parameters
    ----------
    y : np.ndarray
        Audio time series.
    sr : int
        Sample rate of ``y`` in Hz.
    frames_per_second : int, optional
        Analysis frame rate; the hop length is derived as ``sr // frames_per_second``.
        Defaults to 30, matching the original hard-coded value.

    Returns
    -------
    np.ndarray
        1-D array with one pitch value per frame. librosa's piptrack reports
        0.0 in frames where no pitch was detected.
    """
    # hop_length determines how far apart successive analysis frames are.
    hop_length = int(sr / frames_per_second)

    # Extract candidate pitches and their magnitudes via librosa's piptrack.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr, hop_length=hop_length)

    # For every frame, select the pitch of the strongest-magnitude bin.
    # Vectorized replacement for the original per-frame Python argmax loop.
    strongest_bins = magnitudes.argmax(axis=0)
    frame_indices = np.arange(pitches.shape[1])
    pitch_frequencies = pitches[strongest_bins, frame_indices]

    print("shape : ", pitch_frequencies.shape)
    return pitch_frequencies
| def extract_audio_features(audio_path, asrmodel, asrproc, sentipipe, duration, wordcloud_path): | |
| y, sr = librosa.load(audio_path, sr=16000) | |
| inputs = asrproc(y, sampling_rate=sr, return_tensors="pt").input_features | |
| inputs = inputs.to(device, dtype=torch_dtype) | |
| with torch.no_grad(): | |
| generated_ids = asrmodel.generate(inputs) | |
| transcript = asrproc.batch_decode(generated_ids, skip_special_tokens=True)[0] | |
| # Sound intensity (RMS) | |
| rms = librosa.feature.rms(y=y) | |
| sound_intensity = np.mean(rms) | |
| # Pitch list | |
| pitches=get_pitch_list(y,sr) | |
| # Fundamental frequency (F0) | |
| f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7')) | |
| fundamental_frequency = np.nanmean(f0) | |
| # Spectral energy (based on STFT) | |
| S = np.abs(librosa.stft(y)) | |
| spectral_energy = np.mean(np.sum(S ** 2, axis=0)) | |
| # Spectral centroid | |
| spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr) | |
| avg_spectral_centroid = np.mean(spectral_centroid) | |
| # Zero-crossing rate | |
| zcr = librosa.feature.zero_crossing_rate(y) | |
| zero_crossing_rate = np.mean(zcr) | |
| # Pause detection | |
| silence_threshold = -40 | |
| silent_intervals = librosa.effects.split(y, top_db=silence_threshold) | |
| pause_duration = 0 | |
| for start, end in silent_intervals: | |
| pause_duration += (end - start) / sr | |
| total_duration = librosa.get_duration(y=y, sr=sr) | |
| pause_rate = (pause_duration / total_duration) * 60 # Convert to pauses per minute | |
| # Transcript processing | |
| words = nltk.word_tokenize(transcript) | |
| words = [word.lower() for word in words if word not in string.punctuation] | |
| num_words = len(words) | |
| unique_words = len(set(words)) | |
| word_frequencies = Counter(words) | |
| # Duration in minutes | |
| duration_minutes = total_duration / 60 | |
| avg_words_per_minute = num_words / duration_minutes | |
| avg_unique_words_per_minute = unique_words / duration_minutes | |
| # Filler word detection | |
| filler_words = [ | |
| 'uh', 'um', 'like', 'you know', 'ah', 'er', 'hmm', 'well', 'so', | |
| 'I mean', 'okay', 'right', 'actually', 'basically', 'you see', | |
| 'sort of', 'kind of', 'yeah', 'literally', 'just', 'I guess', | |
| 'totally', 'honestly', 'seriously', 'alright' | |
| ] | |
| filler_word_count = sum([word_frequencies.get(filler, 0) for filler in filler_words]) | |
| filler_words_per_minute = filler_word_count / duration_minutes | |
| # POS tagging | |
| pos_tags = nltk.pos_tag(words) | |
| nouns = [word for word, pos in pos_tags if pos.startswith('NN')] | |
| adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')] | |
| verbs = [word for word, pos in pos_tags if pos.startswith('VB')] | |
| # Sentiment analysis | |
| sentiment = sentipipe(transcript) | |
| sentiment_mapping = { | |
| "LABEL_0": "Negative", | |
| "LABEL_1": "Neutral", | |
| "LABEL_2": "Positive" | |
| } | |
| sentiment[0]['label'] = sentiment_mapping[sentiment[0]['label']] | |
| # Generate Word Cloud and Save it as an Image | |
| wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_frequencies) | |
| # Save the Word Cloud to the provided path | |
| plt.figure(figsize=(10, 5)) | |
| plt.imshow(wordcloud, interpolation='bilinear') | |
| plt.axis('off') | |
| plt.savefig(wordcloud_path, format='png') | |
| plt.close() | |
| print("Nouns: ", nouns) | |
| print("Adjectives: ", adjectives) | |
| print("Verbs: ", verbs) | |
| print("Sentiment: ", sentiment) | |
| return { | |
| "transcript": transcript, | |
| "sentiment": sentiment, | |
| "sound_intensity": float(sound_intensity), | |
| "fundamental_frequency": float(fundamental_frequency), | |
| "spectral_energy": float(spectral_energy), | |
| "spectral_centroid": float(avg_spectral_centroid), | |
| "zero_crossing_rate": float(zero_crossing_rate), | |
| "avg_words_per_minute": float(avg_words_per_minute), | |
| "avg_unique_words_per_minute": float(avg_unique_words_per_minute), | |
| "unique_word_count": int(unique_words), | |
| "filler_words_per_minute": float(filler_words_per_minute), | |
| "noun_count": len(nouns), | |
| "adjective_count": len(adjectives), | |
| "verb_count": len(verbs), | |
| "pause_rate": float(pause_rate) | |
| },pitches | |