Spaces:

cruvss
/

Fast_api

Sleeping

App Files Files Community

Fast_api / filler_count /filler_score.py

mulasagg

API optimizations

aef3b1e about 1 year ago

raw

history blame contribute delete

2.95 kB

	import re
	import whisper
	from pydub import AudioSegment # For accurate duration calculation

	def analyze_fillers(file_path: str, model_size: str = "base", transcript=None) -> dict:
	    """
	    Count English filler words in a recording and score the speaker on them.

	    Args:
	        file_path: Path to the audio file. It is always opened with pydub to
	            measure the duration, and is handed to Whisper when no transcript
	            is supplied.
	        model_size: Model name passed to ``whisper.load_model``; only used
	            when ``transcript`` is None.
	        transcript: Optional ready-made transcript text. When None, the audio
	            is transcribed with Whisper.

	    Returns:
	        dict with the keys
	            "filler_counts": lower-cased filler -> number of occurrences,
	            "total_fillers": sum of all occurrences,
	            "filler_score": one of 100, 90, 80, 60, 40, 20 (higher is better),
	            "filler_rate_per_min": fillers per minute, rounded to one decimal.

	    Raises:
	        RuntimeError: If any step fails; the underlying exception is chained
	            as ``__cause__``.
	    """
	    try:
	        filler_words = (
	            "um", "uh", "hmm", "ah", "er", "eh",
	            "umm", "uhh", "mmm", "ahh", "err",
	            "like", "you know", "well", "so", "actually", "basically",
	            "right", "okay", "sort of", "kind of",
	        )

	        # pydub reports the length in milliseconds; convert to seconds.
	        audio = AudioSegment.from_file(file_path)
	        duration = len(audio) / 1000

	        if transcript is None:
	            # No transcript supplied: transcribe the audio with Whisper.
	            model = whisper.load_model(model_size)
	            result = model.transcribe(file_path, word_timestamps=False, fp16=False)
	            transcript = result["text"]

	        # Join the alternatives with a real regex "|". (An escaped "\|" would
	        # match a literal pipe character and therefore never find a filler.)
	        # Longest-first ordering keeps multi-word / longer fillers from being
	        # shadowed by a shorter alternative that shares their prefix.
	        alternatives = "|".join(
	            re.escape(word) for word in sorted(filler_words, key=len, reverse=True)
	        )
	        # The look-arounds require non-word characters (or string edges) on
	        # both sides, so "so" does not match inside "also" or "some".
	        pattern = re.compile(r"(?<!\w)(" + alternatives + r")(?!\w)", re.IGNORECASE)

	        # Count occurrences case-insensitively.
	        filler_counts = {}
	        for match in pattern.findall(transcript):
	            key = match.lower()
	            filler_counts[key] = filler_counts.get(key, 0) + 1
	        total_fillers = sum(filler_counts.values())

	        # Rate per minute; a zero-length recording yields a rate of 0.
	        filler_per_min = (total_fillers / duration) * 60 if duration > 0 else 0

	        # Map the rate onto a fixed score ladder.
	        if total_fillers == 0:
	            filler_score = 100
	        elif filler_per_min < 1:
	            filler_score = 90
	        elif filler_per_min < 3:
	            filler_score = 80
	        elif filler_per_min < 5:
	            filler_score = 60
	        elif filler_per_min < 10:
	            filler_score = 40
	        else:
	            filler_score = 20

	        return {
	            "filler_counts": filler_counts,
	            "total_fillers": total_fillers,
	            "filler_score": filler_score,
	            "filler_rate_per_min": round(filler_per_min, 1),
	        }

	    except Exception as e:
	        # Keep the RuntimeError contract for callers, but chain the cause so
	        # the original traceback is not lost.
	        raise RuntimeError(f"Analysis failed: {e}") from e