Spaces:

mulasagg
/

Voice

Sleeping

App Files Files Community

Voice / vcs /vcs.py

mulasagg

first

8031a8f about 1 year ago

raw

history blame contribute delete

6.64 kB

	"""
	Voice Clarity Score calculation module
	"""

	import librosa
	import numpy as np
	from typing import Dict, Any, List
	import soundfile as sf

	def calculate_articulation(y: np.ndarray, sr: int) -> float:
	    """
	    Calculate articulation quality based on spectral contrast.

	    Articulation refers to how clearly individual phonemes are produced.
	    The score is the mean spectral contrast (averaged over every band and
	    frame) mapped linearly onto 0-100: a mean of 10 or less scores 0 and a
	    mean of 50 or more scores 100.

	    Args:
	        y (np.ndarray): Audio signal
	        sr (int): Sample rate

	    Returns:
	        float: Articulation score (0-100), always a builtin ``float``.
	            An empty signal scores 0.0.
	    """
	    # An empty buffer has nothing to analyse; score it 0.0 instead of
	    # handing a zero-length array to librosa.
	    if y.size == 0:
	        return 0.0

	    # Higher contrast between spectral peaks and valleys generally
	    # correlates with clearer articulation.
	    magnitude = np.abs(librosa.stft(y))
	    contrast = librosa.feature.spectral_contrast(S=magnitude, sr=sr)

	    # Collapse all frequency bands and frames into one figure.
	    mean_contrast = np.mean(contrast)

	    # Working range used for normalisation (empirically chosen by the
	    # original author; stated as roughly 10-50 dB).
	    min_contrast = 10
	    max_contrast = 50
	    scaled = (mean_contrast - min_contrast) / (max_contrast - min_contrast) * 100

	    # Clamp to [0, 100]. float() turns a NumPy scalar or the int clamp
	    # bounds into a builtin float so the return type matches the annotation.
	    return float(min(100, max(0, scaled)))

	def calculate_enunciation(y: np.ndarray, sr: int) -> float:
	    """
	    Calculate enunciation quality from spectral flatness and spectral centroid.

	    Enunciation is the precision in pronouncing vowels and consonants.
	    Two sub-scores are combined:

	    * Flatness score: mean spectral flatness mapped so that 0 scores 100
	      and 0.5 or more scores 0 (lower flatness is treated as clearer speech).
	    * Centroid score: 100 when the mean spectral centroid is 2500 Hz,
	      falling linearly to 0 once it is 2000 Hz or more away from that.

	    The final score is ``0.7 * flatness score + 0.3 * centroid score``.

	    Args:
	        y (np.ndarray): Audio signal
	        sr (int): Sample rate

	    Returns:
	        float: Enunciation score (0-100), always a builtin ``float``.
	            An empty signal scores 0.0.
	    """
	    # An empty buffer has nothing to analyse; score it 0.0 instead of
	    # handing a zero-length array to librosa.
	    if y.size == 0:
	        return 0.0

	    # Lower flatness indicates clearer formants and better enunciation.
	    flatness = np.mean(librosa.feature.spectral_flatness(y=y))

	    # The centroid relates to the "brightness" of the signal.
	    centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))

	    # Flatness: lower is better; 0.5 is used as the worst-case bound.
	    norm_flatness = max(0, min(100, (0.5 - flatness) / 0.5 * 100))

	    # Centroid: mid-range is best, so score the distance from 2500 Hz,
	    # expressed as a fraction of the 2000 Hz expected deviation.
	    ideal_centroid = 2500  # Hz
	    centroid_deviation = abs(centroid - ideal_centroid) / 2000
	    norm_centroid = max(0, min(100, (1 - centroid_deviation) * 100))

	    # Weighted blend (flatness dominates). float() converts a NumPy scalar
	    # into a builtin float so the return type matches the annotation.
	    return float((0.7 * norm_flatness) + (0.3 * norm_centroid))

	def calculate_speech_pause_control(segments: List[Dict]) -> float:
	    """
	    Calculate how effectively pauses are integrated in speech.

	    Speech pause control refers to the natural vs. abrupt pauses in speech.
	    A pause is the gap between one segment's ``"end"`` and the next
	    segment's ``"start"``; only gaps longer than 0.05 count. The thresholds
	    are presumably in seconds, as the timing values of the segments are.

	    Scoring:

	    * The standard deviation of the pauses gives a base score: 70 when it
	      is below 0.1 (too uniform may sound robotic), sliding from 100 down
	      to 70 across 0.1-0.5, then from 70 down to 0 (reached at 2.5).
	    * The fraction of pauses longer than 2.0 subtracts up to 50 points.

	    Args:
	        segments (List[Dict]): List of transcript segments with timing
	            information; each must provide ``"start"`` and ``"end"``.

	    Returns:
	        float: Speech pause control score (0-100), always a builtin
	            ``float``. Returns 100.0 when there are fewer than two
	            segments or no gap exceeds 0.05.
	    """
	    if len(segments) < 2:
	        return 100.0  # Not enough segments to evaluate pauses

	    # Gap between each segment and its successor; overlaps (negative gaps)
	    # and tiny gaps are not pauses and are filtered out.
	    gaps = (
	        following["start"] - current["end"]
	        for current, following in zip(segments, segments[1:])
	    )
	    pause_durations = [gap for gap in gaps if gap > 0.05]

	    if not pause_durations:
	        return 100.0  # No significant pauses detected

	    # More consistent pauses indicate better control.
	    pause_std = float(np.std(pause_durations))

	    # Share of very long (potentially awkward) pauses. The list is known
	    # to be non-empty here, so the division is safe.
	    long_pause_ratio = sum(1 for d in pause_durations if d > 2.0) / len(pause_durations)

	    if pause_std < 0.1:
	        std_score = 70.0  # Too consistent might sound robotic
	    elif pause_std < 0.5:
	        std_score = 100 - ((pause_std - 0.1) / 0.4 * 30)  # 100 down to 70
	    else:
	        std_score = max(0.0, 70 - ((pause_std - 0.5) / 2.0 * 70))  # 70 down to 0

	    # Penalize for too many long pauses
	    long_pause_penalty = long_pause_ratio * 50

	    # Clamp with float bounds so the result is a builtin float even when
	    # a bound is what gets returned.
	    return float(max(0.0, min(100.0, std_score - long_pause_penalty)))

	def calculate_voice_clarity_score(y: np.ndarray, sr: int, segments: List[Dict]) -> Dict[str, Any]:
	    """
	    Compute the Voice Clarity Score (VCS) together with its breakdown.

	    The VCS is a weighted sum of three component scores:
	    45% articulation, 35% enunciation and 20% speech pause control.

	    Args:
	        y (np.ndarray): Audio signal
	        sr (int): Sample rate
	        segments (List[Dict]): List of transcript segments with timing information

	    Returns:
	        Dict[str, Any]: Mapping with three entries: ``"VCS"`` (the overall
	            score), ``"components"`` (the three component scores keyed by
	            name) and ``"insight"`` (text interpreting the overall score).
	    """
	    # Component scores, keyed the way they appear in the result.
	    components = {
	        "articulation": calculate_articulation(y, sr),
	        "enunciation": calculate_enunciation(y, sr),
	        "speech_pause_control": calculate_speech_pause_control(segments),
	    }

	    # Weighted combination (weights taken from the paper the original
	    # implementation refers to).
	    vcs = (
	        (0.45 * components["articulation"])
	        + (0.35 * components["enunciation"])
	        + (0.2 * components["speech_pause_control"])
	    )

	    return {
	        "VCS": vcs,
	        "components": components,
	        "insight": get_clarity_insight(vcs),
	    }

	def get_clarity_insight(vcs: float) -> str:
	    """
	    Map a Voice Clarity Score onto a short explanatory message.

	    The score is compared against descending lower bounds (85, 70, 50, 30);
	    the message of the first bound it reaches is returned, and anything
	    below every bound gets the lowest-tier message.

	    Args:
	        vcs (float): Voice Clarity Score (0-100)

	    Returns:
	        str: Insight text explaining the score
	    """
	    # (lower bound, message) pairs, checked from the highest tier down.
	    tiers = (
	        (85, "Excellent voice clarity with precise articulation and well-controlled pauses. Speech is highly intelligible and pleasant to listen to."),
	        (70, "Good voice clarity with clear pronunciation and generally appropriate pauses. Minor improvements could enhance overall clarity."),
	        (50, "Moderate voice clarity with some articulation issues or irregular pauses. Focus on clearer pronunciation and more natural pausing."),
	        (30, "Below average clarity with noticeable articulation problems or awkward pausing patterns. Consider speech exercises to improve clarity."),
	    )
	    for lower_bound, message in tiers:
	        if vcs >= lower_bound:
	            return message

	    # No tier matched: the score is below the lowest bound.
	    return "Speech clarity needs significant improvement. Articulation is unclear and pausing patterns disrupt intelligibility. Speech therapy exercises may be beneficial."